diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
deleted file mode 100644
index c14d1cb..0000000
--- a/.github/workflows/build.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: build
-
-on:
- push:
- branches:
- - master
- schedule:
- # * is a special character in YAML so you have to quote this string
- - cron: '0 0 * * 6' # base builds run every saturday
-
-jobs:
- build:
- runs-on: ubuntu-latest
- env:
- DOCKER_IMAGE_NAME: scrin/dev-spconv
- DOCKER_FILE_PATH: ./Dockerfile
-
- # TODO: create a action to reuse code. the problem is how to reuse docker-login.
- steps:
- - uses: actions/checkout@master
- - name: Build Docker
- run: |
- docker build . --file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
- docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
-
-
- - name: Login to Registry
- uses: azure/docker-login@v1
- with:
- username: ${{ secrets.DOCKER_USERNAME }}
- password: ${{ secrets.DOCKER_PASSWORD }}
-
- - name: Publish to Registry
- run: |
- docker push ${{env.DOCKER_IMAGE_NAME}}:latest
- docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
diff --git a/.gitignore b/.gitignore
index a68262c..657dd08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,3 +107,5 @@ venv.bak/
.mypy_cache/
.vscode
+
+__version__.py
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index 63f9714..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +0,0 @@
-[submodule "third_party/pybind11"]
- path = third_party/pybind11
- url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cutlass"]
- path = third_party/cutlass
- url = https://github.com/NVIDIA/cutlass
-[submodule "third_party/mp11"]
- path = third_party/mp11
- url = https://github.com/boostorg/mp11
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff4fa57..8a3cfcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog
+## [2.0.0] - 2021-10-16
+### Changed
+- Changed the build system from CMake to pccm.
+- Moved the pytorch python code to spconv.pytorch.
+- Rewrote all C++ code.
+
## [1.2.1] - 2020-06-04
### Changed
- The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase.
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 6be70cf..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
-
-option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
-option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled." ON)
-if (SPCONV_BuildCUDA)
- project(SparseConv LANGUAGES CXX CUDA VERSION 1.1)
-else()
- project(SparseConv LANGUAGES CXX VERSION 1.1)
-endif()
-
-if(WIN32) # true if windows (32 and 64 bit)
- add_compile_definitions(TV_WINDOWS)
-endif()
-add_compile_definitions(PYTORCH_VERSION=${PYTORCH_VERSION})
-
-set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
- add_compile_definitions(TV_DEBUG)
-endif()
-# add_compile_definitions(TV_LOG_KERNEL_INFO)
-
-find_package(Torch REQUIRED)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
-if (SPCONV_BuildCUDA)
- set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}")
- get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
- get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY)
- if(WIN32) # true if windows (32 and 64 bit)
- set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
- else()
- set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
- endif()
- find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS})
- find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS})
- torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA)
- string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}")
- set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}")
- add_compile_definitions(TV_CUDA)
-endif()
-# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-add_subdirectory(third_party/pybind11)
-
-set(ALL_LIBS ${TORCH_LIBRARIES})
-
-set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
-set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
-set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)
-
-if (SPCONV_BuildCUDA)
- set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
- set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
- add_subdirectory(src/cuhash)
- add_subdirectory(src/spgemm)
-endif()
-add_subdirectory(src/spconv)
-add_subdirectory(src/utils)
-
-if (SPCONV_BuildTests)
- include(CTest) #adds option BUILD_TESTING (default ON)
- if(BUILD_TESTING)
- enable_testing()
- add_subdirectory(test)
- endif()
-endif()
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 007ba79..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,9 +0,0 @@
-FROM scrin/dev:latest
-
-RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \
- sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \
- sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \
- cd /root && \
- git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \
- cd ./spconv && \
- SPCONV_FORCE_BUILD_CUDA=1 python setup.py install
diff --git a/LICENSE b/LICENSE
index b131473..ba46142 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright 2019-2020 Yan Yan
+ Copyright 2019-2021 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/PERFORMANCE_GUIDE.md b/PERFORMANCE_GUIDE.md
deleted file mode 100644
index 54bef8f..0000000
--- a/PERFORMANCE_GUIDE.md
+++ /dev/null
@@ -1,31 +0,0 @@
-## Performance Guide
-
-### 1. Regular sparse conv is very slow
-
-Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower!
-
-This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points.
-
-**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead.
-
-### 2. Large Spatial Shape cost too much GPU memory
-
-Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory.
-
-To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data.
-
-Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large.
-
-### 3. Stacked submanifold convolution can share same indice data
-
-When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution.
-
-### 4. Different convolution algorithm may lead to different performance
-
-There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```.
-
-* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points.
-
-* ```Batch```: **cost more GPU memory** should be used when number of active points is small.
-
-* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution.
\ No newline at end of file
diff --git a/README.md b/README.md
index 1e95261..c4b671c 100644
--- a/README.md
+++ b/README.md
@@ -1,173 +1,95 @@
+
+
# SpConv: PyTorch Spatially Sparse Convolution Library
[](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
-This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool.
-
-
-2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv.
-
-
-## Docker:
-
-```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow.
-
-## Install on Ubuntu 16.04/18.04
-
-* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support).
-
-0. Use ```git clone xxx.git --recursive``` to clone this repo.
-
-1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path.
-
-2. Download cmake >= 3.13.2, then add cmake executables to PATH.
-
-3. Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-
-4. Run ```cd ./dist```, use pip to install generated whl file.
-
-## Install on Windows 10 with CUDA 10 and python 3.6 (python 3.7 may have problem, see [this](https://github.com/pytorch/pytorch/issues/17233))
-
-Since install newest driver and CUDA is very simple on windows, please use CUDA 10 on windows.
-
-0. Install Visual Studio 2017. Use ```git clone xxx.git --recursive``` to clone this repo.
-
-1. Download compressed files from boost official website and copy headers (i.e. boost_1_69/boost) to spconv/include.
-
-2. Download and install cmake >= 3.13.2, select add cmake to User or System PATH.
-
-3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```).
-
-4. Run ```cd ./dist```, use pip to install generated whl file.
-
-## Compare with SparseConvNet
-
-### Features
-
-* SparseConvNet's Sparse Convolution don't support padding and dilation, spconv support this.
-
-* spconv only contains sparse convolutions, the batchnorm and activations can directly use layers from torch.nn, SparseConvNet contains lots of their own implementation of layers such as batchnorm and activations.
-
-### Speed
-
-* spconv is faster than SparseConvNet due to gpu indice generation and gather-gemm-scatter algorithm. SparseConvNet use hand-written gemm which is slow.
-
-## Usage
-
-### SparseConvTensor
-
-```Python
-features = # your features with shape [N, numPlanes]
-indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0]
-spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i].
-batch_size = # batch size of your sparse tensor.
-x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size)
-x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor.
-print(x.sparity) # helper function to check sparity.
-```
-
-### Sparse Convolution
-
-```Python
-import spconv
-from torch import nn
-class ExampleNet(nn.Module):
- def __init__(self, shape):
- super().__init__()
- self.net = spconv.SparseSequential(
- spconv.SparseConv3d(32, 64, 3), # just like nn.Conv3d but don't support group and all([d > 1, s > 1])
- nn.BatchNorm1d(64), # non-spatial layers can be used directly in SparseSequential.
- nn.ReLU(),
- spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- # when use submanifold convolutions, their indices can be shared to save indices generation time.
- spconv.SubMConv3d(64, 64, 3, indice_key="subm0"),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- spconv.SparseConvTranspose3d(64, 64, 3, 2),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- spconv.ToDense(), # convert spconv tensor to dense and convert it to NCHW format.
- nn.Conv3d(64, 64, 3),
- nn.BatchNorm1d(64),
- nn.ReLU(),
- )
- self.shape = shape
-
- def forward(self, features, coors, batch_size):
- coors = coors.int() # unlike torch, this library only accept int coordinates.
- x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
- return self.net(x)# .dense()
-```
+# WORK IN PROGRESS, DON'T USE!!!
-### Inverse Convolution
+## Breaking changes in Spconv 2.x
-Inverse sparse convolution means "inv" of sparse convolution. the output of inverse convolution contains same indices as input of sparse convolution.
+* ```spconv.xxx``` has moved to ```spconv.pytorch.xxx```; change all ```import spconv``` to ```import spconv.pytorch as spconv``` and ```from spconv.xxx import``` to ```from spconv.pytorch.xxx import``` (see the migration sketch below).
+* ```use_hash``` in sparse convolution is removed; 2.x always uses a hash table.
+* The weight layout has been changed to RSKC (native algorithm) or KRSC (implicit GEMM); it is no longer RSCK as in spconv 1.x. RS is the kernel size, C is the input channel count, K is the output channel count.
+* All util ops are removed (pillar scatter/nms/...).
+* ```VoxelGenerator``` has been replaced by ```Point2VoxelGPU[1-4]d```/```Point2VoxelCPU[1-4]d``` (see ```example/voxel_gen.py```).
+* spconv 2.x doesn't support CPU for now.
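+
+A minimal migration sketch of the import change and the new weight layout (the permute below is only an illustration of RSCK vs. RSKC for a hypothetical 3x3x3 kernel, not an official conversion utility):
+
+```Python
+# spconv 1.x style (no longer works):
+#   import spconv
+# spconv 2.x style:
+import spconv.pytorch as spconv
+import torch
+
+# hypothetical spconv 1.x weight in RSCK layout: [kD, kH, kW, C=32, K=64]
+w_rsck = torch.empty(3, 3, 3, 32, 64)
+# native-algorithm RSKC layout in 2.x: [kD, kH, kW, K, C]
+w_rskc = w_rsck.permute(0, 1, 2, 4, 3).contiguous()
+```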
-Inverse convolution usually used in semantic segmentation.
+## News in Spconv 2.0.0
-```Python
-class ExampleNet(nn.Module):
- def __init__(self, shape):
- super().__init__()
- self.net = spconv.SparseSequential(
- spconv.SparseConv3d(32, 64, 3, 2, indice_key="cp0"),
- spconv.SparseInverseConv3d(64, 32, 3, indice_key="cp0"), # need provide kernel size to create weight
- )
- self.shape = shape
+* Training/inference speed is increased.
+* Int8/tensor core support.
+* No longer depends on the pytorch binary.
+* If your GPU has tensor cores, try mixed precision training in spconv 2.x (see the sketch below)!
+* Since spconv 2.x doesn't depend on the pytorch binary (and never will), torch.jit/libtorch inference can't be supported.
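+
+A minimal mixed-precision sketch (assumes you already built a spconv-based module ```net``` and a ```SparseConvTensor``` ```x```; the optimizer step is omitted):
+
+```Python
+import torch
+
+scaler = torch.cuda.amp.GradScaler()
+with torch.cuda.amp.autocast():
+    out = net(x)               # spconv layers can run under autocast
+    loss = out.features.sum()  # SparseConvTensor keeps its data in .features
+scaler.scale(loss).backward()
+# a real loop would follow with scaler.step(optimizer); scaler.update()
+```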
- def forward(self, features, coors, batch_size):
- coors = coors.int()
- x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
- return self.net(x)
-```
+## TODO in Spconv 2.x
+- [ ] Ampere (A100 / RTX 3000 series) feature support (work in progress)
+- [ ] torch QAT support (work in progress)
+- [ ] TensorRT (torch.fx based)
+- [ ] Build C++ only package
+- [ ] JIT compilation for CUDA kernels
+- [ ] Documentation (low priority)
+- [ ] CPU support (low priority)
-### Utility functions
+## Install
-* convert point cloud to voxel
+You need to install python >= 3.6 first to use spconv 2.x.
-```Python
+You need to install the CUDA toolkit before using the prebuilt binaries or building from source.
-voxel_generator = spconv.utils.VoxelGenerator(
- voxel_size=[0.1, 0.1, 0.1],
- point_cloud_range=[-50, -50, -3, 50, 50, 1],
- max_num_points=30,
- max_voxels=40000
-)
+You need at least CUDA 10.2 to build and run spconv 2.x. We won't offer any support for CUDA < 10.2.
-points = # [N, 3+] tensor.
-voxels, coords, num_points_per_voxel = voxel_generator.generate(points)
-```
+### Prebuilt
-## Implementation Details
+We offer Python 3.6-3.10 and CUDA 10.2/11.1/11.4 prebuilt binaries for Linux (manylinux) and Windows 10/11.
-This implementation use gather-gemm-scatter framework to do sparse convolution.
+We will offer prebuilts for the CUDA versions supported by the latest pytorch release. For example, pytorch 1.9 supports CUDA 10.2 and 11.1, so we support them too.
-## Projects using spconv:
+For Linux users, you need to install pip >= 20.3 first to install the prebuilt wheels.
-* [second.pytorch](https://github.com/traveller59/second.pytorch): Point Cloud Object Detection in KITTI Dataset.
+```pip install spconv-cu102``` for CUDA 10.2
-## Authors
+```pip install spconv-cu111``` for CUDA 11.1
-* **Yan Yan** - *Initial work* - [traveller59](https://github.com/traveller59)
+```pip install spconv-cu114``` for CUDA 11.4
-* **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)
+### Build from source
-## Third party libraries
+You need to rebuild ```cumm``` first if you are building against a CUDA version that is not covered by the prebuilts.
-* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.
+#### Linux
-* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.
+1. install build-essential, install CUDA
+2. run ```export SPCONV_DISABLE_JIT="1"```
+3. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.
+#### Windows 10/11
-* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.
+1. install Visual Studio 2019 or newer. make sure the C++ development workload is installed. install CUDA
+2. set [powershell script execution policy](https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-7.1)
+3. start a new powershell, run ```tools/msvc_setup.ps1```
+4. run ```$Env:SPCONV_DISABLE_JIT = "1"```
+5. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dist/xxx.whl```
-## License
+## Note
-This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details
+This work was done while the author was an employee at Tusimple.
-The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.
+## LICENSE
-The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.
+Apache 2.0
\ No newline at end of file
diff --git a/codeai-devops.yaml b/codeai-devops.yaml
deleted file mode 100644
index 47afa8c..0000000
--- a/codeai-devops.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-global:
- console_url: localhost:50091
- envs:
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
-
-analyzers: # only one analyzer is allowed for one type for now.
- PythonAnalyzer:
-
- SimpleCPPAnalyzer: # $ devops.devs = ["_ci_dev_xxx"] is allowed in raw sources.
- includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
-
-observers:
- # run test functions when that function change or marked function change.
- test:
- type: TestObserver
-
- # run dev functions when that function change or marked function change.
- dev:
- type: DevObserver
- pattern: _ci_dev_.*
-
- clangdev:
- type: CPPDevObserver
- main_pattern: dev_.*\.(cc|cpp|cxx)
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx)
- compiler: clang++
- executable: build/codeai_dev
- includes: [
- include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/include,
- /home/yy/library/boost_1_72_0,
- ]
- libpaths: [
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg]
- std: c++14
- options: [-Wall, -Wextra]
-
- cudadev:
- type: CPPDevObserver
- main_pattern: dev_.*\.cu
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
- compiler: nvcc
- executable: build/codeai_dev_cuda
- run_cmd: [$(executable)]
- sources: []
- includes: [
- include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/cutlass/include,
- ]
- libpaths: [
- /usr/local/cuda/lib64,
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg]
- std: c++14
- options: [
- -Wno-deprecated-declarations,
- # "-gencode=arch=compute_52,code=sm_52",
- "-gencode=arch=compute_61,code=sm_61",
- # "-gencode=arch=compute_60,code=sm_60",
- # "-gencode=arch=compute_70,code=sm_70",
- # "-gencode=arch=compute_75,code=sm_75",
- ]
-
- torchdev:
- type: CPPDevObserver
- main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
- pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
- compiler: nvcc
- executable: build/codeai_dev_torch
- run_cmd: [$(executable)]
- fail_cmds: # run cmd when pervious run fail with retcode
- -6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix
- includes: [
- include,
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/include,
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include,
- /usr/local/cuda/include,
- /home/yy/anaconda3/include,
- /home/yy/anaconda3/include/python3.7m,
- third_party/pybind11/include,
- third_party/cutlass/include,
- ]
- libpaths: [
- /home/yy/anaconda3/lib/python3.7/site-packages/torch/lib,
- /usr/local/cuda/lib64,
- /home/yy/anaconda3/lib,
- ]
- libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread,
- "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"",
- "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
- std: c++14
- # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
-
- options: [
- -Wno-deprecated-declarations,
- --expt-relaxed-constexpr,
- "-gencode=arch=compute_61,code=sm_61",
- -D_GLIBCXX_USE_CXX11_ABI=0,
- ]
-
-
-
diff --git a/docs/API.md b/docs/API.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/API.md
@@ -0,0 +1,16 @@
+
+
diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/DEVELOPMENT.md
@@ -0,0 +1,16 @@
+
+
diff --git a/docs/PERFORMANCE_GUIDE.md b/docs/PERFORMANCE_GUIDE.md
new file mode 100644
index 0000000..820fea0
--- /dev/null
+++ b/docs/PERFORMANCE_GUIDE.md
@@ -0,0 +1,16 @@
+
+
diff --git a/example/mnist_sparse.py b/example/mnist_sparse.py
index 5830c8b..537d26c 100644
--- a/example/mnist_sparse.py
+++ b/example/mnist_sparse.py
@@ -1,7 +1,21 @@
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from __future__ import print_function
import argparse
import torch
-import spconv
+import spconv.pytorch as spconv
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
diff --git a/example/voxel_gen.py b/example/voxel_gen.py
new file mode 100644
index 0000000..2c910f8
--- /dev/null
+++ b/example/voxel_gen.py
@@ -0,0 +1,38 @@
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from cumm import tensorview as tv
+from spconv.utils import Point2VoxelCPU3d
+
+
+def main():
+ gen = Point2VoxelCPU3d(
+ vsize_xyz=[0.1, 0.1, 0.1],
+ coors_range_xyz=[-80, -80, -2, 80, 80, 6],
+ num_point_features=3,
+ max_num_voxels=5000,
+ max_num_points_per_voxel=5)
+
+ pc = np.random.uniform(-10, 10, size=[1000, 3])
+ pc_tv = tv.from_numpy(pc)
+ # generate voxels. note that voxels_tv references a persistent buffer owned
+ # by the generator, so copy the result out before the next call and don't
+ # run generation from multiple threads.
+ voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel(pc_tv)
+ # run voxel gen again, this time filling the empty remainder of each voxel
+ # with the mean of the points inside it.
+ voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_empty_mean(pc_tv)
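+ # to inspect the results from python, a tv.Tensor can be viewed as a numpy
+ # array (this assumes cumm's zero-copy numpy_view() accessor)
+ voxels_np = voxels_tv.numpy_view()
+ indices_np = indices_tv.numpy_view()
+ print(voxels_np.shape, indices_np.shape)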
+
+if __name__ == "__main__":
+ main()
diff --git a/include/cuhash/cuda_util.h b/include/cuhash/cuda_util.h
deleted file mode 100644
index c3ee1ca..0000000
--- a/include/cuhash/cuda_util.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _CUDA_UTIL_H_
-#define _CUDA_UTIL_H_
-
-#if CUDART_VERSION >= 4000
-#define CUDA_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
-#else
-#define CUDA_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
-#endif
-
-#define CUDA_SAFE_CALL_NO_SYNC(call) \
- { \
- cudaError err = call; \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
- __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-
-#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
-
-//! Check for CUDA error
-#ifdef _DEBUG
-#define CUDA_CHECK_ERROR(errorMessage) \
- { \
- cudaError_t err = cudaGetLastError(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- err = CUDA_DEVICE_SYNCHRONIZE(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-#else
-#define CUDA_CHECK_ERROR(errorMessage) \
- { \
- cudaError_t err = cudaGetLastError(); \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
- errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-#endif
-
-#endif
\ No newline at end of file
diff --git a/include/cuhash/debugging.h b/include/cuhash/debugging.h
deleted file mode 100644
index 22d4c1c..0000000
--- a/include/cuhash/debugging.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.h
- *
- * @brief Debugging/statistics/performance utilities header for hash tables.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H
-
-#include "definitions.h"
-#include
-#include
-
-#include
-
-namespace cuhash {
-
-//! @name Debugging functions
-/// @{
-
-void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
- const unsigned table_size,
- const uint2 *constants,
- const unsigned kNumHashFunctions);
-
-//! Output how many probes were required by each thread to perform the
-//! retrieval.
-/*! @param[in] n_queries Number of queries being performed.
- * @param[in] d_retrieval_probes Device array: the number of probes taken for
- * each thread's retrieval.
- * @param[in] n_functions Number of hash functions used.
- */
-void OutputRetrievalStatistics(const unsigned n_queries,
- const unsigned *d_retrieval_probes,
- const unsigned n_functions);
-
-//! Outputs information about how many iterations threads required to
-//! successfully cuckoo hash.
-/*! @param[in] n Number of keys in the input.
- * @param[in] d_iterations_taken Device mem: Number of iterations each
- * thread took.
- * @param[in] d_max_iterations_taken Device mem: Largest number of iterations
- * taken by any thread.
- */
-void OutputBuildStatistics(const unsigned n,
- const unsigned *d_iterations_taken);
-
-//! Prints out the contents of the stash.
-void PrintStashContents(const Entry *d_stash);
-
-//! Checks if a key is assigned the same slot by different hash functions.
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
- const unsigned *d_keys, const unsigned table_size,
- uint2 *constants);
-
-/// @}
-
-}; // namespace cuhash
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/cuhash/definitions.h b/include/cuhash/definitions.h
deleted file mode 100644
index 658fb87..0000000
--- a/include/cuhash/definitions.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file definitions.h
- *
- * @brief Stores configuration flags and definitions for hard-coded values in
- * hash table implementations.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H
-
-#include <cstdio>
-#include <cstdlib>
-#include <limits>
-
-/* --------------------------------------------------------------------------
- Debugging.
- -------------------------------------------------------------------------- */
-#ifdef _DEBUG
-//! Forces the hash functions to generate a full set of slots for each key when
-//! not using subtables.
-// #define FORCEFULLY_GENERATE_NO_CYCLES
-
-//! Count how many iterations are taken to insert/find items.
-#define TRACK_ITERATIONS
-
-//! Count how many items fail to be inserted when the hash table fails to build.
-#define COUNT_UNINSERTED
-
-//! Take some statistics on the hash functions.
-#define TAKE_HASH_FUNCTION_STATISTICS
-
-#ifdef TAKE_HASH_FUNCTION_STATISTICS
-//! Determine how many keys hash into each table slot.
-#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-
-//! Determine how many unique slots a key is assigned.
-#define COUNT_HOW_MANY_HAVE_CYCLES
-#endif
-#endif
-
-#ifdef USE_DAN_OUTPUT
-#include
-//! Logs any error messages.
-inline void PrintMessage(const char *message, const bool error = false) {
- PrintIndentedMessage(message, error);
-}
-#else
-//! Prints a message out to the console.
-inline void PrintMessage(const char *message, const bool error = false) {
- if (error) {
- printf("cudahash: %s\n", message);
- } else {
- printf("%s\n", message);
- }
-}
-#endif
-
-/* -------------------------------------------------------------------------
- Hash table constants and definitions.
- ------------------------------------------------------------------------- */
-namespace cuhash {
-
-/**
- * \addtogroup cudpp_hash_data_structures
- *
- * @{
- */
-
-typedef unsigned long long
- Entry; //!< A key and its value are stored in a 64-bit number. The key is
- //!< stored in the upper 32 bits.
-
-const unsigned kMaxRestartAttempts = 10; //!< Number of build attempts.
-const unsigned kKeyEmpty = 0xffffffffu; //!< Signifies empty slots in the table.
-const unsigned kNotFound =
- 0xffffffffu; //!< Signifies that a query key was not found.
-const unsigned kMaxHashFunctions =
- 5; //!< Maximum number of hash functions allowed.
-const unsigned kStashSize =
- 101; //!< How many slots the stash hash table contains.
-
-//! Value indicating that a hash table slot has no valid item within it.
-const Entry kEntryEmpty = Entry(kKeyEmpty) << 32;
-
-//! Value returned when a query fails.
-const Entry kEntryNotFound = (Entry(kKeyEmpty) << 32) + kNotFound;
-
-//! Number of threads to put in a thread block.
-const unsigned kBlockSize = 64;
-
-//! Number of blocks to put along each axis of the grid.
-const unsigned kGridSize = 16384;
-
-//! Minimum table sizes for 2 through 5 functions.
-const float kMinimumSpaceUsages[] = {std::numeric_limits<float>::max(),
- std::numeric_limits<float>::max(),
- 2.01f,
- 1.1f,
- 1.03f,
- 1.02f};
-
-/** @} */ // end cudpp_hash_data_structures
-
-}; // namespace cuhash
-
-#endif
diff --git a/include/cuhash/hash_functions.h b/include/cuhash/hash_functions.h
deleted file mode 100644
index 31ce6f7..0000000
--- a/include/cuhash/hash_functions.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*! @file hash_functions.h
- * @brief Hash function code.
- */
-
-#ifndef HASH_FUNCTIONS__H
-#define HASH_FUNCTIONS__H
-
-#include "definitions.h"
-#include
-#include
-
-namespace cuhash {
-
-//! Prime number larger than the largest practical hash table size.
-const unsigned kPrimeDivisor = 4294967291u;
-// https://www.alpertron.com.ar/ECM.HTM
-// const unsigned long kPrimeDivisor = 18446744073709551557lu
-// const long kPrimeDivisor = 9223372036854775783l
-// const Entry kPrimeDivisor = 4300000013lu;
-// const unsigned kPrimeDivisor = 334214459;
-
-//! Generates a set of linear hash function constants.
-/*! @param[in] N Number of hash functions.
- @param[out] constants CPU pointer to the constants.
- @param[in] num_keys Debug only: How many keys are in the input.
- @param[in] d_keys Debug only: Device memory array containing the input
- keys.
- @param[in] table_size Debug only: Size of the hash table.
- */
-void GenerateFunctions(const unsigned N, const unsigned num_keys,
- const unsigned *d_keys, const unsigned table_size,
- uint2 *constants);
-
-//! Container for all of the hash functions.
-template <unsigned N> struct Functions {
- //! The constants required for all of the hash functions, including the stash.
- //! Each function requires 2.
- uint2 constants[N];
-
- //! Generate new hash function constants.
- /*! The parameters are only used for debugging and examining the key
- distribution. \param[in] num_keys Debug: Number of keys in the input.
- \param[in] d_keys Debug: Device array of the input keys.
- \param[in] table_size Debug: Size of the hash table.
- */
- void Generate(const unsigned num_keys, const unsigned *d_keys,
- const unsigned table_size) {
- GenerateFunctions(N, num_keys, d_keys, table_size, constants);
- }
-};
-
-//! Computes the value of a hash function for a given key.
-/*! \param[in] constants Constants used by the hash function.
- ! \param[in] key Key being hashed.
- ! \returns The value of the hash function for the key.
- */
-inline __device__ __host__ unsigned hash_function_inner(const uint2 constants,
- const unsigned key) {
-#if 1
- // Fast version.
- return ((constants.x ^ key) + constants.y) % kPrimeDivisor;
-#else
- // Slow version.
- return ((unsigned long long)constants.x * key + constants.y) % kPrimeDivisor;
-#endif
-}
-
-//! Computes the value of a hash function for a given key.
-/*! \param[in] functions All of the constants used by the hash functions.
- ! \param[in] which_function Which hash function is being used.
- ! \param[in] key Key being hashed.
- ! \returns The value of a hash function with a given key.
- */
-template <unsigned kNumHashFunctions>
-TV_HOST_DEVICE_INLINE unsigned
-hash_function(const Functions<kNumHashFunctions> functions,
- const unsigned which_function, const unsigned key) {
- return hash_function_inner(functions.constants[which_function], key);
-}
-
-//! Simple hash function used by the stash.
-TV_HOST_DEVICE_INLINE
-unsigned stash_hash_function(const uint2 stash_constants, const unsigned key) {
- return (stash_constants.x ^ key + stash_constants.y) % kStashSize;
-}
-
-unsigned generate_random_uint32();
-
-}; // namespace cuhash
-
-#endif
diff --git a/include/cuhash/hash_table.cuh b/include/cuhash/hash_table.cuh
deleted file mode 100644
index 29f9b70..0000000
--- a/include/cuhash/hash_table.cuh
+++ /dev/null
@@ -1,275 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cuh
- *
- * @brief Implements kernel and __device__ functions for a basic hash table.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH
-
-#include "definitions.h"
-#include "hash_table.h"
-#include
-#include
-
-namespace cuhash {
-
-//! Makes an 64-bit Entry out of a key-value pair for the hash table.
-TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) {
- return (Entry(key) << 32) + value;
-}
-
-//! Returns the key of an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) {
- return (unsigned)(entry >> 32);
-}
-
-//! Returns the value of an Entry.
-TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) {
- return (unsigned)(entry & 0xffffffff);
-}
-
-//! @name Internal
-//! @brief Functions used for building the hash table.
-//! @{
-
-//! Fills the entire array with a specific value.
-template <class T>
-__global__ void clear_table(const unsigned table_size, const T value,
- T *table) {
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index < table_size) {
- table[thread_index] = value;
- }
-}
-
-//! Determine where in the hash table the key could be located.
-template <unsigned kNumHashFunctions>
-__device__ void KeyLocations(const Functions<kNumHashFunctions> constants,
- const unsigned table_size, const unsigned key,
- unsigned locations[kNumHashFunctions]) {
-// Compute all possible locations for the key in the big table.
-#pragma unroll
- for (int i = 0; i < kNumHashFunctions; ++i) {
- locations[i] = hash_function(constants, i, key) % table_size;
- }
-}
-//! @}
-
-/* --------------------------------------------------------------------------
- Retrieval functions.
- -------------------------------------------------------------------------- */
-//! Answers a single query.
-/*! @ingroup PublicInterface
- * @param[in] key Query key
- * @param[in] table_size Size of the hash table
- * @param[in] table The contents of the hash table
- * @param[in] constants The hash functions used to build the table
- * @param[in] stash_constants The hash function used to build the stash
- * @param[in] stash_count The number of items in the stash
- * @param[out] num_probes_required Debug only: The number of probes required
- * to resolve the query.
- * @returns The value of the query key, if the key exists in the table.
- * Otherwise, \ref kNotFound will be returned.
- */
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-retrieve(const unsigned query_key, const unsigned table_size,
- const Entry *table, const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants, const unsigned stash_count,
- unsigned *num_probes_required = NULL) {
- // Identify all of the locations that the key can be located in.
- unsigned locations[kNumHashFunctions];
- KeyLocations(constants, table_size, query_key, locations);
-
- // Check each location until the key is found.
- unsigned num_probes = 1;
- Entry entry = table[locations[0]];
- unsigned key = get_key(entry);
-
-#pragma unroll
- for (unsigned i = 1; i < kNumHashFunctions; ++i) {
- if (key != query_key && key != kNotFound) {
- num_probes++;
- entry = table[locations[i]];
- key = get_key(entry);
- }
- }
-
- // Check the stash.
- if (stash_count && get_key(entry) != query_key) {
- num_probes++;
- const Entry *stash = table + table_size;
- unsigned slot = stash_hash_function(stash_constants, query_key);
- entry = stash[slot];
- }
-
-#ifdef TRACK_ITERATIONS
- if (num_probes_required) {
- *num_probes_required = num_probes;
- }
-#endif
-
- if (get_key(entry) == query_key) {
- return get_value(entry);
- } else {
- return kNotFound;
- }
-}
-
-//! Perform a retrieval from a basic hash table. Each thread manages a single
-//! query.
-template <unsigned kNumHashFunctions>
-__global__ void hash_retrieve(const unsigned n_queries, const unsigned *keys_in,
- const unsigned table_size, const Entry *table,
- const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants,
- const unsigned stash_count, unsigned *values_out,
- unsigned *num_probes_required = NULL) {
- // Get the key.
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index >= n_queries)
- return;
- unsigned key = keys_in[thread_index];
-
- values_out[thread_index] = retrieve(
- key, table_size, table, constants, stash_constants, stash_count,
- (num_probes_required ? num_probes_required + thread_index : NULL));
-}
-
-/* --------------------------------------------------------------------------
- Build a cuckoo hash table.
- -------------------------------------------------------------------------- */
-//! @name Internal
-//! @{
-
-//! Determine where to insert the key next. The hash functions are used in
-//! round-robin order.
-template <unsigned kNumHashFunctions>
-__device__ unsigned
-determine_next_location(const Functions<kNumHashFunctions> constants,
- const unsigned table_size, const unsigned key,
- const unsigned previous_location) {
- // Identify all possible locations for the entry.
- unsigned locations[kNumHashFunctions];
-#pragma unroll
- for (unsigned i = 0; i < kNumHashFunctions; ++i) {
- locations[i] = hash_function(constants, i, key) % table_size;
- }
-
- // Figure out where the item should be inserted next.
- unsigned next_location = locations[0];
-#pragma unroll
- for (int i = kNumHashFunctions - 2; i >= 0; --i) {
- next_location =
- (previous_location == locations[i] ? locations[i + 1] : next_location);
- }
- return next_location;
-}
-
-//! Attempts to insert a single entry into the hash table.
-/*! This process stops after a certain number of iterations. If the thread is
- still holding onto an item because of an eviction, it tries the stash.
- If it fails to enter the stash, it returns false.
- Otherwise, it succeeds and returns true.
- */
-template <unsigned kNumHashFunctions>
-__device__ bool
-insert(const unsigned table_size, const Functions<kNumHashFunctions> constants,
- const uint2 stash_constants, const unsigned max_iteration_attempts,
- Entry *table, unsigned *stash_count, Entry entry,
- unsigned *iterations_used) {
- unsigned key = get_key(entry);
-
- // The key is always inserted into its first slot at the start.
- unsigned location = hash_function(constants, 0, key) % table_size;
-
- // Keep inserting until an empty slot is found or the eviction chain grows too
- // large.
- for (unsigned its = 1; its <= max_iteration_attempts; its++) {
- // Insert the new entry.
- entry = atomicExch(&table[location], entry);
- key = get_key(entry);
-
- // If no key was evicted, we're done.
- if (key == kKeyEmpty) {
- *iterations_used = its;
- break;
- }
-
- // Otherwise, determine where the evicted key will go.
- location = determine_next_location(constants, table_size, key, location);
- }
-
- if (key != kKeyEmpty) {
- // Shove it into the stash.
- unsigned slot = stash_hash_function(stash_constants, key);
- Entry *stash = table + table_size;
- Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry);
- if (replaced_entry != kEntryEmpty) {
- return false;
- } else {
- atomicAdd(stash_count, 1);
- }
- }
-
- return true;
-}
-
-// Build a basic hash table, using one big table.
-template <unsigned kNumHashFunctions>
-__global__ void CuckooHash(const unsigned n_entries, const unsigned *keys,
- const unsigned *values, const unsigned table_size,
- const Functions<kNumHashFunctions> constants,
- const unsigned max_iteration_attempts, Entry *table,
- uint2 stash_constants, unsigned *stash_count,
- unsigned *failures,
- unsigned *iterations_taken = nullptr) {
- // Check if this thread has an item and if any previous threads failed.
- unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
- blockIdx.y * blockDim.x * gridDim.x;
- if (thread_index >= n_entries || *failures)
- return;
- Entry entry = make_entry(keys[thread_index], values[thread_index]);
-
- unsigned iterations = 0;
- bool success = insert(
- table_size, constants, stash_constants, max_iteration_attempts, table,
- stash_count, entry, &iterations);
-
- if (success == false) {
- // The eviction chain grew too large. Report failure.
-#ifdef COUNT_UNINSERTED
- atomicAdd(failures, 1);
-#else
- *failures = 1;
-#endif
- }
-
-#ifdef TRACK_ITERATIONS
- iterations_taken[thread_index] = iterations;
-#endif
-}
-//! @}
-
-}; // namespace cuhash
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/cuhash/hash_table.h b/include/cuhash/hash_table.h
deleted file mode 100644
index 055e08c..0000000
--- a/include/cuhash/hash_table.h
+++ /dev/null
@@ -1,228 +0,0 @@
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.h
- *
- * @brief Header for a basic hash table that stores one value per key.
- */
-
-#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
-
-#include "definitions.h"
-#include "hash_functions.h"
-
-#include
-
-/** \addtogroup cudpp_app
- * @{
- */
-
-/** \addtogroup cudpp_hash_data_structures
- * @{
- */
-
-/* --------------------------------------------------------------------------
- Doxygen definitions.
- -------------------------------------------------------------------------- */
-/*! @namespace CudaHT
- * @brief Encapsulates the hash table library.
- */
-
-/*! @namespace CuckooHashing
- * @brief Encapsulates the cuckoo hash table that uses stashes.
- */
-
-/* -------------------------------------------------------------------------
- Hash table code.
- ------------------------------------------------------------------------- */
-namespace cuhash {
-
-//! Compute how many thread blocks are required for the given number of threads.
-dim3 ComputeGridDim(unsigned threads);
-
-//! Compute how long an eviction chain is allowed to become for a given input
-//! size.
-/*! \param[in] num_keys Number of keys in the input.
- * \param[in] table_size Number of slots in the hash table.
- * \param[in] num_functions Number of hash functions being used.
- * \returns The number of iterations that should be allowed.
- *
- * The latter two parameters are only needed when using an empirical
- * formula for computing the chain length.
- */
-unsigned ComputeMaxIterations(const unsigned num_keys,
- const unsigned table_size,
- const unsigned num_functions);
-
-//! Basic hash table that stores one value for each key.
-/*! The input consists of two unsigned arrays of keys and values.
- * None of the keys are expected to be repeated.
- *
- * @todo Templatize the interface without forcing the header file to
- * have CUDA calls.
- * @ingroup cudpp_app
- */
-class HashTable {
-public:
- HashTable();
-
- virtual ~HashTable() { Release(); }
-
- //! Initialize the hash table's memory. Must be called before \ref
- //! Build() and after the random number generator has been seeded.
- /*! @param[in] max_input_size Largest expected number of items in the input.
- * @param[in] space_usage Size of the hash table relative to the
- * input. Bigger tables are faster to build
- * and retrieve from.
- * @param[in] num_functions Number of hash functions to use. May be
- * 2-5. More hash functions make it easier
- * to build the table, but increase
- * retrieval times.
- * @returns Whether the hash table was initialized successfully (true)
- * or not (false).
- *
- * The minimum space usage is dependent on the number of functions
- * being used; for two through five functions, the minimum space
- * usage is 2.1, 1.1, 1.03, and 1.02 respectively.
- */
- virtual bool Initialize(const unsigned max_input_size,
- const float space_usage = 1.25,
- const unsigned num_functions = 4);
-
- //! Free all memory.
- virtual void Release();
-
- //! Build the hash table.
- /*! @param[in] input_size Number of key-value pairs being inserted.
- * @param[in] d_keys Device memory array containing all of the input
- * keys.
- * @param[in] d_vals Device memory array containing the keys' values.
- * @returns Whether the hash table was built successfully (true) or
- * not (false).
- *
- * Several attempts are allowed to build the hash table in case of failure.
- * The input keys are expected to be completely unique.
- * To reduce the chance of a failure, increase the space usage or number of
- * functions.
- * Keys are not allowed to be equal to cuhash::kKeyEmpty.
- */
- virtual bool Build(const unsigned input_size, const unsigned *d_keys,
- const unsigned *d_vals);
-
- //! Query the hash table.
- /*! @param[in] n_queries Number of keys in the query set.
- * @param[in] d_query_keys Device memory array containing all of
- * the query keys.
- * @param[in] d_query_results Values for the query keys.
- *
- * kNotFound is returned for any query key that failed to be found
- * in the table.
- */
- virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
- unsigned *d_query_results);
-
- //! @name Accessors
- /// @brief Mainly needed to use the __device__ CudaHT::retrieve()
- /// function directly.
- /// @{
-
- //! Returns how many slots the hash table has.
- inline unsigned get_table_size() const { return table_size_; }
-
- //! Returns how many items are stored in the stash.
- inline unsigned get_stash_count() const { return stash_count_; }
-
- //! Returns the constants used by the stash.
- inline uint2 get_stash_constants() const { return stash_constants_; }
-
- //! Returns the hash table contents.
- inline const Entry *get_contents() const { return d_contents_; }
-
- //! Returns the number of hash functions being used.
- inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
-
- //! When using two hash functions, returns the constants.
- inline Functions<2> get_constants_2() const { return constants_2_; }
-
- //! When using three hash functions, returns the constants.
- inline Functions<3> get_constants_3() const { return constants_3_; }
-
- //! When using four hash functions, returns the constants.
- inline Functions<4> get_constants_4() const { return constants_4_; }
-
- //! When using five hash functions, returns the constants.
- inline Functions<5> get_constants_5() const { return constants_5_; }
-
- /// @}
- inline Entry *data() { return d_contents_; }
- inline const Entry *data() const { return d_contents_; }
-
-protected:
- unsigned table_size_; //!< Size of the hash table.
- unsigned num_hash_functions_; //!< Number of hash functions being used.
- Entry *d_contents_; //!< Device memory: The hash table contents. The stash is
- //!< stored at the end.
- unsigned stash_count_; //!< Number of key-value pairs currently stored.
- uint2 stash_constants_; //!< Hash function constants for the stash.
-
- Functions<2> constants_2_; //!< Constants for a set of two hash functions.
- Functions<3> constants_3_; //!< Constants for a set of three hash functions.
- Functions<4> constants_4_; //!< Constants for a set of four hash functions.
- Functions<5> constants_5_; //!< Constants for a set of five hash functions.
-
- unsigned *d_failures_; //!< Device memory: General use error flag.
-};
-
-/*! @name Internal
- * @{
- */
-namespace CUDAWrapper {
-//! Fills a 64-bit array with a particular value.
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
- Entry *d_array);
-
-//! Calls the Cuckoo Hash construction kernel.
-void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
- const unsigned *d_keys, const unsigned *d_values,
- const unsigned table_size, const Functions<2> constants_2,
- const Functions<3> constants_3,
- const Functions<4> constants_4,
- const Functions<5> constants_5,
- const unsigned max_iteration_attempts, Entry *d_contents,
- uint2 stash_constants, unsigned *d_stash_count,
- unsigned *d_failures, unsigned *d_iterations_taken);
-
-//! Calls the kernel that performs retrievals.
-void CallHashRetrieve(const unsigned n_queries,
- const unsigned num_hash_functions,
- const unsigned *keys_in, const unsigned table_size,
- const Entry *table, const Functions<2> constants_2,
- const Functions<3> constants_3,
- const Functions<4> constants_4,
- const Functions<5> constants_5,
- const uint2 stash_constants, const unsigned stash_count,
- unsigned *values_out);
-}; // namespace CUDAWrapper
-/// @}
-
-}; // namespace cuhash
-
-/** @} */ // end hash table data structures
-/** @} */ // end cudpp_app
-
-#endif
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
diff --git a/include/paramsgrid.h b/include/paramsgrid.h
deleted file mode 100644
index c978dfe..0000000
--- a/include/paramsgrid.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is used for c++ unit test, but pytorch jit ops don't support c++
-// debug build.
-
-#ifndef PARAMS_GRID_H_
-#define PARAMS_GRID_H_
-#include <tuple>
-#include <vector>
-
-namespace detail {
-template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
-
-template <class T, class... TArgs>
-int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
- return arg.size() * getTotalSize(args...);
-}
-template <class T> int getSize(std::vector<T> arg) { return arg.size(); }
-
-template <int Idx, class TT, class T>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
- std::get<Idx>(src) = arg[counter[Idx]];
-}
-
-template <int Idx, class TT, class T, class... TArgs>
-void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
- std::vector<TArgs> &... args) {
- std::get<Idx>(src) = arg[counter[Idx]];
- assigner<Idx + 1>(src, counter, args...);
-}
-} // namespace detail
-template <class... TArgs>
-std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
- int length = detail::getTotalSize(args...);
- std::vector<int> sizes = {detail::getSize(args)...};
- int size = sizes.size();
-
- std::vector<std::tuple<TArgs...>> params(length);
- std::vector<int> counter(size);
- for (int i = 0; i < length; ++i) {
- detail::assigner<0>(params[i], counter, args...);
- counter[size - 1] += 1;
- for (int c = size - 1; c >= 0; --c) {
- if (counter[c] == sizes[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return params;
-}
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/box_iou.h b/include/spconv/box_iou.h
deleted file mode 100644
index 15ceee8..0000000
--- a/include/spconv/box_iou.h
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BOX_IOU_H
-#define BOX_IOU_H
-
-#include <boost/geometry.hpp>
-// must include pybind11/eigen.h if using eigen matrix as arguments.
-#include <pybind11/eigen.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-
-namespace spconv {
-// #include "voxelnet/core/cc/pybind11_helper.h"
-namespace py = pybind11;
-using namespace pybind11::literals;
-template
-inline py::array_t constant(ShapeContainer shape, DType value) {
- // create ROWMAJOR array.
- py::array_t array(shape);
- std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
- return array;
-}
-
-template
-inline py::array_t zeros(std::vector shape) {
- return constant>(shape, 0);
-}
-
-template <typename DType>
-py::array_t<DType>
-rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
-          py::array_t<DType> standup_iou, DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
- DType inter_area, union_area;
- auto box_corners_r = box_corners.template unchecked<3>();
- auto qbox_corners_r = qbox_corners.template unchecked<3>();
- auto standup_iou_r = standup_iou.template unchecked<2>();
- auto N = box_corners_r.shape(0);
- auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
- auto overlaps_rw = overlaps.template mutable_unchecked<2>();
- if (N == 0 || K == 0) {
- return overlaps;
- }
- for (int k = 0; k < K; ++k) {
- for (int n = 0; n < N; ++n) {
- if (standup_iou_r(n, k) <= standup_thresh)
- continue;
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
- bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
- bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
- bg::intersection(poly, qpoly, poly_inter);
-
- if (!poly_inter.empty()) {
- inter_area = bg::area(poly_inter.front());
- bg::union_(poly, qpoly, poly_union);
- if (!poly_union.empty()) {
- union_area = bg::area(poly_union.front());
- overlaps_rw(n, k) = inter_area / union_area;
- }
- poly_union.clear();
- }
- poly.clear();
- qpoly.clear();
- poly_inter.clear();
- }
- }
- return overlaps;
-}
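-
-// Note: standup_iou is the IoU of the axis-aligned ("standup") bounding
-// boxes; pairs at or below standup_thresh are skipped, so the expensive
-// boost::geometry polygon clipping above only runs for box pairs that can
-// plausibly overlap.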
-
-template <typename DType>
-py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
-                                      py::array_t<DType> qbox_corners,
-                                      py::array_t<DType> standup_iou,
-                                      DType standup_thresh) {
-  namespace bg = boost::geometry;
-  typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
-  typedef bg::model::polygon<point_t> polygon_t;
-  polygon_t poly, qpoly;
-  std::vector<polygon_t> poly_inter, poly_union;
- DType inter_area, union_area;
- auto box_corners_r = box_corners.template unchecked<3>();
- auto qbox_corners_r = qbox_corners.template unchecked<3>();
- auto standup_iou_r = standup_iou.template unchecked<2>();
- auto N = box_corners_r.shape(0);
- auto K = qbox_corners_r.shape(0);
-  py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
- auto overlaps_rw = overlaps.template mutable_unchecked<2>();
- if (N == 0 || K == 0) {
- return overlaps;
- }
- for (int k = 0; k < K; ++k) {
- for (int n = 0; n < N; ++n) {
- if (standup_iou_r(n, k) <= standup_thresh)
- continue;
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
- bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
- bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
- bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
- bg::append(qpoly,
- point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
-
- bg::intersection(poly, qpoly, poly_inter);
-
- if (!poly_inter.empty()) {
- inter_area = bg::area(poly_inter.front());
- overlaps_rw(n, k) = inter_area;
- }
- poly.clear();
- qpoly.clear();
- poly_inter.clear();
- }
- }
- return overlaps;
-}
-
-} // namespace spconv
-#endif
\ No newline at end of file
diff --git a/include/spconv/cublas_gemm.h b/include/spconv/cublas_gemm.h
deleted file mode 100644
index 117a127..0000000
--- a/include/spconv/cublas_gemm.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#pragma once
-#include <cublas_v2.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <typename T>
-cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
- cublasOperation_t transb, int m, int n, int k,
- const T *alpha, const T *A, int lda, const T *B,
- int ldb, const T *beta, T *C, int ldc);
-
-template <typename T>
-cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
- cublasOperation_t transb, int m, int n, int k,
- const T *alpha, const T *A, int lda, const T *B,
- int ldb, const T *beta, T *C, int ldc) {
- return cublasTgemm(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
- beta, C, ldc);
-}
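-
-// Why this works: cuBLAS is column-major, and a row-major (m, n) matrix is
-// bit-identical to a column-major (n, m) matrix. Computing op(B) * op(A)
-// with m and n swapped therefore produces C^T in column-major layout, which
-// is exactly C in row-major layout.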
-
-template <typename T> inline T constant_scalar(float data) { return T(data); }
-
-template <typename T>
-cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
-                    const tv::TensorView<const T> A,
-                    const tv::TensorView<const T> B, tv::TensorView<T> C) {
- TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
- TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
- auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
- auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
- int m = transa ? A.dim(1) : A.dim(0);
- int n = transb ? B.dim(0) : B.dim(1);
- int ka = transa ? A.dim(0) : A.dim(1);
- int kb = transb ? B.dim(1) : B.dim(0);
- int lda = transa ? m : ka;
- int ldb = transb ? ka : n;
- int ldc = n;
- TV_ASSERT_RT_ERR(ka == kb, "error");
-  T alpha = constant_scalar<T>(1);
-  T beta = constant_scalar<T>(0);
- return cublasTgemmRow(handle, transa_cublas, transb_cublas, m, n, ka,
- &alpha, A.data(), lda, B.data(), ldb, &beta,
- C.data(), ldc);
-}
-
-} // namespace spconv
diff --git a/include/spconv/fused_conv.cu.h b/include/spconv/fused_conv.cu.h
deleted file mode 100644
index 15533d1..0000000
--- a/include/spconv/fused_conv.cu.h
+++ /dev/null
@@ -1,629 +0,0 @@
-
-/*
-BSD License
-
-For SparseConvNet software
-
-Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name Facebook nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific
- prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#define TACC double
-
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // nHot must be a multiple of K!!
-
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
-
- // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
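-  //
-  // Example (hypothetical sizes): K=64, V=16 gives blockDim=(64, 4); each
-  // thread accumulates V=16 partial outputs in registers while K x K tiles
-  // of the input and weights are staged through shared memory
-  // (Volkov-style register blocking).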
-
- int32_t M = input_nPlanes / K;
- // N = gridDim.y == output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
-
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
-
- for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- outFeatures[R1[v] * output_stride + tx] = O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
-
- // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
-
- int32_t M = input_nPlanes / K;
- // N = gridDim.y == output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
-
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
-
- for (int32_t m = 0; m < M; m++) {
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
-
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- }
- __syncthreads();
-
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
-
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- O[v] += outFeatures[R1[v] * output_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- outFeatures[R1[v] * output_stride + tx] = O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-
-#define FOO(T, K, V) \
- { \
- if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- int32_t o = (nHot / K) * K; \
- if (o >= K) \
-        dConvolution_KMxKN_forwardA<T, K, V>                                  \
-            <<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K,        \
-                    nGroups),                                                 \
-               dim3(K, K / V), 0, s>>>(                                       \
- inFeatures, outFeatures, w, rulesIn, rulesOut, o, \
- input_nPlanes, input_stride, output_nPlanes, output_stride); \
- if (nHot > o) \
-        dConvolution_KMxKN_forwardB<T, K, V>                                  \
-            <<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
- inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
- nHot - o, input_nPlanes, input_stride, output_nPlanes, \
- output_stride); \
- return; \
- } \
- }
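-// Callers invoke FOO with progressively smaller tile sizes until one divides
-// the plane counts; kernel A covers the prefix of nHot that is a multiple of
-// K with no bounds checks, and kernel B handles the remaining tail rows with
-// per-row bounds checks.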
-template <typename T>
-void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- FOO(T, 64, 16)
- FOO(T, 32, 8)
- FOO(T, 16, 4)
- FOO(T, 8, 2)
- assert(false);
-}
-template <>
-void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
- double *outFeatures, double *w,
- int32_t *rulesIn, int32_t *rulesOut,
- int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes,
- int32_t output_stride, int32_t nGroups) {
- FOO(double, 32, 8)
- FOO(double, 16, 4)
- FOO(double, 8, 2)
- assert(false);
-}
-#undef FOO
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_A(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = output_nPlanes / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- dI[v] += dO[ty[v]][k] * W[tx][k];
- dW[v] += I[k][ty[v]] * dO[k][tx];
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dInFeatures[R0[v] * input_stride + tx] = dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW_B(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = output_nPlanes / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- int32_t R0[V];
- int32_t R1[V];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (s + ty[v] < nHot) {
- R0[v] = rulesIn[s + ty[v]];
- R1[v] = rulesOut[s + ty[v]];
- }
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot) {
- I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
- dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
- } else {
- I[ty[v]][tx] = 0;
- dO[ty[v]][tx] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- dI[v] += dO[ty[v]][k] * W[tx][k];
- dW[v] += I[k][ty[v]] * dO[k][tx];
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- dI[v] += dInFeatures[R0[v] * input_stride + tx];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (s + ty[v] < nHot)
- dInFeatures[R0[v] * input_stride + tx] = dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-#define FOO(T, K, V) \
- { \
- if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
- int32_t o = (nHot / K) * K; \
- if (o >= K) \
-        dConvolution_KMxKN_backward_dW_A<T, K, V>                             \
-            <<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K,         \
-                    nGroups),                                                 \
-               dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures, \
- w, dw, rulesIn, rulesOut, o, \
- input_nPlanes, input_stride, \
- output_nPlanes, output_stride); \
- if (nHot > o) \
-        dConvolution_KMxKN_backward_dW_B<T, K, V>                             \
-            <<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>(  \
- inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
- rulesOut + o, nHot - o, input_nPlanes, input_stride, \
- output_nPlanes, output_stride); \
- return; \
- } \
- }
-template <typename T>
-void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
- T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
- int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- FOO(T, 32, 8)
- FOO(T, 16, 4)
- FOO(T, 8, 2)
- assert(false);
-}
-#undef FOO
-template <typename T, int32_t K, int32_t V>
-__global__ void
-dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride) {
- // Input x Weight -> Output
- // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
- // K is a multiple of V,
- // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
- // - parallel over N,nHot - loop over M
- int32_t M = (input_nPlanes + K - 1) / K;
- // N = gridDim.y ~ output_nPlanes/K
- int32_t n = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += g * input_nPlanes;
- outFeatures += n * K + g * output_nPlanes;
- w += n * K + g * input_nPlanes * output_nPlanes;
- int32_t KO = min(K, output_nPlanes - K * n);
- TACC O[V];
- __shared__ T W[K][K];
- __shared__ T I[K][K];
- __shared__ int32_t R[K * 2];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t m = 0; m < M; m++) {
- int32_t KI = min(K, input_nPlanes - K * m);
-// Read w
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (ty[v] < KI and tx < KO)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < 1) {
- if (s + tx < nHot) {
- R[2 * tx] = rulesIn[s + tx];
- R[2 * tx + 1] = rulesOut[s + tx];
- }
- // R[q] = rules[2 * s + q];
- }
- }
- __syncthreads();
-// Read input, reset O[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (tx < KI and s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
- O[v] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < KI; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- O[v] += I[ty[v]][k] * W[k][tx];
- __syncthreads();
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (tx < KO and s + ty[v] < nHot)
- outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
- __syncthreads();
- }
- w += K * output_nPlanes;
- inFeatures += K;
- }
-}
-// dOutput x W^T -> dInput and
-// Input^T x dOutput -> dW
-// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
-template <typename T, int32_t K, int32_t V>
-__global__ void dConvolution_KMxKN_backward_dW2(
- T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
- int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
- // M = gridDim.y == input_nPlanes / K
- int32_t N = (output_nPlanes + K - 1) / K;
- int32_t m = blockIdx.y;
- int32_t g = blockIdx.z;
- inFeatures += m * K + g * input_nPlanes;
- dInFeatures += m * K + g * input_nPlanes;
- dOutFeatures += g * output_nPlanes;
- w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
- int32_t KI = min(K, input_nPlanes - K * m);
- TACC dI[V];
- TACC dW[V];
- __shared__ T I[K][K];
- __shared__ T dO[K][K];
- __shared__ T W[K][K];
- __shared__ int32_t R[K * 2];
- const int32_t tx = threadIdx.x;
- int32_t ty[V];
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- ty[v] = threadIdx.y + v * (K / V);
- for (int32_t n = 0; n < N; n++) {
- int32_t KO = min(K, output_nPlanes - K * n);
-// Read w, reset dW
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < KI and tx < KO)
- W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
- dW[v] = 0;
- }
- for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
-// Read rules for K input/output pairs, reset dI[]
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (ty[v] < 1) {
- if (s + tx < nHot) {
- R[2 * tx] = rulesIn[s + tx];
- R[2 * tx + 1] = rulesOut[s + tx];
- }
- // R[q] = rules[2 * s + q];
- }
- dI[v] = 0;
- }
- __syncthreads();
-// Read input and dOutput
-#pragma unroll
- for (int32_t v = 0; v < V; v++) {
- if (tx < KI and s + ty[v] < nHot)
- I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
- else
- I[ty[v]][tx] = 0;
- if (tx < KO and s + ty[v] < nHot)
- dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
- else
- dO[ty[v]][tx] = 0;
- }
- __syncthreads();
-#pragma unroll
- for (int32_t k = 0; k < KO; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dI[v] += dO[ty[v]][k] * W[tx][k];
-#pragma unroll
- for (int32_t k = 0; k < K; k++)
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- dW[v] += I[k][ty[v]] * dO[k][tx];
- __syncthreads();
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (tx < KI and s + ty[v] < nHot)
- dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
- __syncthreads();
- }
-#pragma unroll
- for (int32_t v = 0; v < V; v++)
- if (ty[v] < KI and tx < KO)
- atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
- w += K;
- dw += K;
- dOutFeatures += K;
- }
-}
-template <typename T>
-void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
- int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- int32_t c = input_nPlanes * output_nPlanes * nGroups;
- if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
- const int32_t K = 16;
- const int32_t V = 4;
-    dConvolution_KMxKN_forward2<T, K, V>
-        <<<dim3(std::min(nHot / K + 1, (int32_t)512),
-                (output_nPlanes + K - 1) / K, nGroups),
-           dim3(K, K / V), 0, s>>>(inFeatures, outFeatures, w, rulesIn,
-                                   rulesOut, nHot,
- input_nPlanes, input_stride, output_nPlanes, output_stride);
-
- } else {
- dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
- input_nPlanes, input_stride, output_nPlanes,
- output_stride, nGroups);
- }
-}
-template <typename T>
-void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
- T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
- int32_t *rulesOut, int32_t nHot,
- int32_t input_nPlanes, int32_t input_stride,
- int32_t output_nPlanes, int32_t output_stride,
- int32_t nGroups) {
- int32_t c = input_nPlanes * output_nPlanes * nGroups;
- if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
- const int32_t K = 16;
- const int32_t V = 4;
-    dConvolution_KMxKN_backward_dW2<T, K, V>
-        <<<dim3(std::min(nHot / K + 1, (int32_t)512),
-                (input_nPlanes + K - 1) / K, nGroups),
-           dim3(K, K / V), 0,
-           s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
- nHot, input_nPlanes, input_stride, output_nPlanes,
- output_stride);
- } else {
- dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
- rulesIn, rulesOut, nHot, input_nPlanes,
- input_stride, output_nPlanes, output_stride,
- nGroups);
- }
-}
-#undef TACC
\ No newline at end of file
diff --git a/include/spconv/fused_conv.h b/include/spconv/fused_conv.h
deleted file mode 100644
index a02d569..0000000
--- a/include/spconv/fused_conv.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <tensorview/mp_helper.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-namespace spconv {
-
-enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
-using all_fused_conv_algos_t =
-    tv::mp_list_c<FusedConvAlgo, kFSparseConvNet, kFMinkowskiEngine>;
-
-void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
- torch::Tensor filters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
- torch::Tensor dout, torch::Tensor filters,
- torch::Tensor dfilters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
- torch::Tensor filters, torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-void fused_conv_backward_cuda_minkowski(torch::Tensor features,
- torch::Tensor din, torch::Tensor dout,
- torch::Tensor filters,
- torch::Tensor dfilters,
- torch::Tensor indicesIn,
- torch::Tensor indicesOut, int nHot);
-
-template <FusedConvAlgo A> struct FusedConvDispatch;
-
-template <> struct FusedConvDispatch<kFSparseConvNet> {
- constexpr static auto *fwd = fused_conv_cuda;
- constexpr static auto *bwd = fused_conv_backward_cuda;
-};
-
-template <> struct FusedConvDispatch<kFMinkowskiEngine> {
- constexpr static auto *fwd = fused_conv_cuda_minkowski;
- constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
-};
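-
-// Usage sketch: the algorithm enum selects a kernel pair at compile time,
-// e.g. FusedConvDispatch<kFSparseConvNet>::fwd(output, features, filters,
-// indicesIn, indicesOut, nHot).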
-
-} // namespace spconv
diff --git a/include/spconv/fused_spconv_ops.h b/include/spconv/fused_spconv_ops.h
deleted file mode 100644
index cb87e67..0000000
--- a/include/spconv/fused_spconv_ops.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSED_SPARSE_CONV_OP_H_
-#define FUSED_SPARSE_CONV_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/reordering.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-#include <utility/timer.h>
-
-namespace spconv {
-// torch.jit's doc says it only supports int64, so we need to convert to int32.
-
-torch::Tensor
-fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
- torch::Tensor bias, torch::Tensor indicePairs,
- torch::Tensor indiceNum, int64_t numActOut,
- int64_t _inverse, int64_t _subM) {
- bool subM = _subM != 0;
- bool inverse = _inverse != 0;
- auto device = features.device().type();
- auto ndim = filters.dim() - 2;
- auto kernelVolume = indicePairs.size(0);
- auto numInPlanes = features.size(1);
- auto numOutPlanes = filters.size(ndim + 1);
- auto indicePairNumCpu = indiceNum.to({torch::kCPU});
-  auto indicePairMaxSizeIter =
-      std::max_element(indicePairNumCpu.data_ptr<int>(),
-                       indicePairNumCpu.data_ptr<int>() + kernelVolume);
-  int indicePairMaxOffset =
-      indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
- int indicePairMaxSize = *indicePairMaxSizeIter;
-
-  /*if (_subM){
-    std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
-                                      indicePairNumCpu.data_ptr<int>() + kernelVolume);
-    indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
-
-    auto indicePairVecMaxSizeIter = std::max_element(
-        indicePairNumVec.begin(), indicePairNumVec.end());
-    indicePairMaxSize = *indicePairVecMaxSizeIter;
-  }*/
-
- auto options =
- torch::TensorOptions().dtype(features.dtype()).device(features.device());
- // auto indicePairOptions =
- // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
-
- torch::Tensor output =
- torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
- torch::Tensor inputBuffer =
- torch::zeros({indicePairMaxSize, numInPlanes}, options);
- torch::Tensor outputBuffer =
- torch::zeros({indicePairMaxSize, numOutPlanes}, options);
- filters = filters.view({-1, numInPlanes, numOutPlanes});
-  if (subM) { // the center index of subm conv doesn't need gather and
-              // scatter-add.
- torch::mm_out(output, features, filters[indicePairMaxOffset]);
- }
- double totalGatherTime = 0;
- double totalGEMMTime = 0;
- double totalSAddTime = 0;
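-  // One gather -> GEMM -> scatter-add pass per kernel offset: gather the
-  // nHot active input rows into inputBuffer, multiply them by that offset's
-  // filter slice, then scatter-add the resulting rows into output.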
- for (int i = 0; i < kernelVolume; ++i) {
-    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
- if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
- continue;
- }
- // auto timer = spconv::CudaContextTimer<>();
- auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
- {nHot, numOutPlanes}, options);
- auto inputBufferBlob =
- torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
-
- if (device == torch::kCPU) {
- sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
- }
-#ifdef TV_CUDA
- else if (device == torch::kCUDA) {
- sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
- }
-#endif
- else {
- TV_ASSERT_INVALID_ARG(false, "unknown device type");
- }
-
- // totalGatherTime += timer.report() / 1000.0;
- torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
- // totalGEMMTime += timer.report() / 1000.0;
-
- if (device == torch::kCPU) {
- sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
- nHot);
- }
-#ifdef TV_CUDA
- else if (device == torch::kCUDA) {
- sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
- nHot);
- }
-#endif
- else {
- TV_ASSERT_INVALID_ARG(false, "unknown device type");
- }
-
- // totalSAddTime += timer.report() / 1000.0;
- }
- // std::cout << "gather time " << totalGatherTime << std::endl;
- // std::cout << "gemm time " << totalGEMMTime << std::endl;
- // std::cout << "scatteradd time " << totalSAddTime << std::endl;
- return output;
-}
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/geometry.h b/include/spconv/geometry.h
deleted file mode 100644
index d6bf3de..0000000
--- a/include/spconv/geometry.h
+++ /dev/null
@@ -1,183 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPCONV_GEOMETRY_H_
-#define SPCONV_GEOMETRY_H_
-
-#include <tensorview/tensorview.h>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-namespace spconv {
-
-namespace detail {
-
-template <typename T> struct ToUnsigned;
-
-template <> struct ToUnsigned<int32_t> { using type = uint32_t; };
-
-template <> struct ToUnsigned<int64_t> { using type = uint64_t; };
-
-template <typename T> struct FNVInternal;
-template <> struct FNVInternal<int32_t> {
-  constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
-  constexpr static uint32_t prime = 0x01000193;
-};
-
-template <> struct FNVInternal<int64_t> {
-  constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
-  constexpr static uint64_t prime = 0x100000001b3;
-};
-
-} // namespace detail
-template <typename T>
-using to_unsigned_t = typename detail::ToUnsigned<std::decay_t<T>>::type;
-
-template <typename T> struct FNV1a : detail::FNVInternal<T> {
-  std::size_t operator()(const T *data, std::size_t size) {
-    to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
-    for (std::size_t i = 0; i < size; ++i) {
-      hash *= detail::FNVInternal<T>::prime;
-      hash ^= static_cast<to_unsigned_t<T>>(data[i]);
-    }
-    return hash;
-  }
-};
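-
-// Note: despite the name, this hashes in FNV-1 order (multiply, then xor);
-// FNV-1a xors before multiplying. Either order yields a usable hash for
-// integer coordinate keys.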
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
- const Index *kernelSize,
- const Index *stride, const Index *padding,
- const Index *dilation,
- const Index *outSpatialShape, Index *out) {
- Index lowers[NDim];
- Index uppers[NDim];
- Index counter[NDim];
- Index counterSize[NDim];
- Index pointCounter = 0;
- Index val;
- Index numPoints = 1;
- Index m, offset;
- bool valid = false;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
- stride[i] + padding[i]) /
- stride[i];
- uppers[i] = (input_pos[i] + padding[i]) / stride[i];
- }
-
-#pragma unroll
- for (unsigned i = 0; i < NDim; ++i) {
- counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
- numPoints *= counterSize[i];
- }
-
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- counter[i] = 0;
- }
- for (int i = 0; i < numPoints; ++i) {
- valid = true;
- m = 1;
- offset = 0;
-#pragma unroll
- for (int j = NDim - 1; j >= 0; --j) {
- val = uppers[j] - counter[j] * dilation[j];
- out[pointCounter * (NDim + 1) + j] = val;
- if (val < 0 || (val > outSpatialShape[j] - 1)) {
- valid = false;
- // break;
- }
- offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
- m *= kernelSize[j];
- }
-
- out[pointCounter * (NDim + 1) + NDim] = offset;
- if (valid)
- ++pointCounter;
- counter[NDim - 1] += 1;
-#pragma unroll
- for (int c = NDim - 1; c >= 0; --c) {
- if (counter[c] == counterSize[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return pointCounter;
-}
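-
-// Worked example (1-D, hypothetical numbers): input_pos=5, kernelSize=3,
-// stride=1, padding=1, dilation=1 gives lowers=4 and uppers=6, so output
-// positions 4, 5, 6 receive this input with kernel offsets 2, 1, 0.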
-
-template <typename Index, unsigned NDim>
-TV_HOST_DEVICE Index getValidOutPosTranspose(
- const Index *input_pos, const Index *kernelSize, const Index *stride,
- const Index *padding, const Index *dilation, const Index *outSpatialShape,
- Index *out) {
- Index lowers[NDim];
- Index uppers[NDim];
- Index counter[NDim];
- Index counterSize[NDim];
- Index pointCounter = 0;
- Index val;
- Index numPoints = 1;
- Index m, offset;
- bool valid = false;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- lowers[i] = input_pos[i] * stride[i] - padding[i];
- uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
- }
-#pragma unroll
- for (unsigned i = 0; i < NDim; ++i) {
- counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
- numPoints *= counterSize[i];
- }
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- counter[i] = 0;
- }
- for (int i = 0; i < numPoints; ++i) {
- valid = true;
- m = 1;
- offset = 0;
-#pragma unroll
- for (int j = NDim - 1; j >= 0; --j) {
- val = uppers[j] - counter[j] * dilation[j];
- out[pointCounter * (NDim + 1) + j] = val;
- if (val < 0 || (val > outSpatialShape[j] - 1)) {
- valid = false;
- // break;
- }
- offset += m * (val - lowers[j]) / dilation[j];
- m *= kernelSize[j];
- }
- out[pointCounter * (NDim + 1) + NDim] = offset;
- if (valid)
- ++pointCounter;
- counter[NDim - 1] += 1;
-#pragma unroll
- for (int c = NDim - 1; c >= 0; --c) {
- if (counter[c] == counterSize[c] && c > 0) {
- counter[c - 1] += 1;
- counter[c] = 0;
- }
- }
- }
- return pointCounter;
-}
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/indice.cu.h b/include/spconv/indice.cu.h
deleted file mode 100644
index f21cc47..0000000
--- a/include/spconv/indice.cu.h
+++ /dev/null
@@ -1,571 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef INDICE_CU_H_
-#define INDICE_CU_H_
-#include <cuhash/hash_table.cuh>
-#include <spconv/geometry.h>
-#include <tensorview/kernel_utils.h>
-#include <tensorview/tensorview.h>
-
-namespace spconv {
-
-template <typename Index, unsigned NDim, bool Transpose>
-struct ConvIndiceDispatch;
-
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<Index, NDim, true> {
-  constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
-};
-template <typename Index, unsigned NDim>
-struct ConvIndiceDispatch<Index, NDim, false> {
-  constexpr static auto *func = getValidOutPos<Index, NDim>;
-};
-
-template <typename Index, unsigned NDim, int KernelMaxVolume, bool Transpose>
-__global__ void prepareIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index kernelVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- kernelVolume *= kernelSize[i];
- }
- Index numValidPoints = 0;
- Index validPoints[KernelMaxVolume * (NDim + 1)];
- Index *pointPtr = nullptr;
- auto indicePairsDim2 = indicePairs.dim(2);
- Index index;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = ConvIndiceDispatch<Index, NDim, Transpose>::func(
- indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
- stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
- validPoints);
- for (Index i = 0; i < numValidPoints; ++i) {
- pointPtr = validPoints + i * (NDim + 1);
- auto offset = pointPtr[NDim];
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- indicePairs(0, offset, oldNum) = ix;
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- pointPtr, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- indicePairs(1, offset, oldNum) = index;
- indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
- }
- }
-}
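-
-// After this kernel, indicePairs(0, offset, n) holds an input row index and
-// indicePairs(1, offset, n) a flattened output spatial index; a later pass
-// (grid- or hash-based) rewrites the latter into compacted output rows.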
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void assignGridAndIndiceOutKernel(
-    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
-    int numAct, tv::TensorView<Index> indicePairs,
-    tv::TensorView<Index> indicePairUnique,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
-
- Index index;
- auto indicesOutPtr = indicesOut.data();
-  for (int ix : tv::KernelLoopX<int>(numAct)) {
- index = indicePairUnique[ix];
- gridsOut[index] = ix;
-    index = tv::rowArrayIdxInv<Index, NDim>(
- index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
- indicesOut[ix * (NDim + 1)] = index % batchSize;
- }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
-                      tv::TensorView<Index> indicePairUnique,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape,
-                      int batchSize) {
-
- Index index;
- auto indicesOutPtr = indicesOut.data();
-  for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
- index = indicePairUnique[ix];
-    index = tv::rowArrayIdxInv<Index, NDim>(
- index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
- indicesOut[ix * (NDim + 1)] = index % batchSize;
- }
-}
-
-template <typename Index>
-__global__ void
-assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
-                            tv::TensorView<Index> indicePairs,
-                            tv::TensorView<Index> indicePairUnique,
-                            unsigned table_size, const cuhash::Entry *table,
-                            cuhash::Functions<4> constants,
-                            uint2 stash_constants, unsigned stash_count) {
-
- Index index;
- int kernelVolume = indicePairs.dim(1);
- auto indicePairsOut = indicePairs.subview(1);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairsOut(i, ix);
- if (index > -1) {
- auto val = cuhash::retrieve((unsigned)(index), table_size, table,
- constants, stash_constants, stash_count);
- assert(val != cuhash::kNotFound);
- indicePairsOut(i, ix) = (unsigned)val;
- }
- }
- }
-}
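-
-// The hash variant resolves flattened spatial indices through a cuckoo hash
-// table (cuhash, with a small stash) instead of a dense grid, trading lookup
-// cost for memory when the spatial volume is large.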
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void
-assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
-                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
-                        tv::TensorView<Index> indicePairs,
-                        tv::TensorView<Index> indicePairUnique,
-                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
-
- Index index;
- int kernelVolume = indicePairs.dim(1);
- auto indicePairsOut = indicePairs.subview(1);
-
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairsOut(i, ix);
- if (index > -1) {
- indicePairsOut(i, ix) = gridsOut[index];
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid>
-__global__ void
-assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut,
-                               int numActIn, tv::TensorView<Index> indicePairs,
-                               tv::TensorView<Index> indiceNum) {
-
- Index index, val;
- int kernelVolume = indicePairs.dim(0);
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- for (int i = 0; i < kernelVolume; ++i) {
- index = indicePairs(i, 1, ix);
- if (index != -1) {
- val = gridsOut[index];
- if (val != -1) {
- auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
- indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
- indicePairs(i, 1, oldNum) = val;
- }
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim>
-__global__ void prepareSubMGridKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
- auto numActIn = indicesIn.dim(0);
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- index =
-        tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- gridsOut[index] = ix;
- }
-}
-
-template <typename Index, unsigned NDim>
-__global__ void
-prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
-                      unsigned *values,
-                      const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
- outSpatialShape.data()) +
- spatialVolume * indicesIn(ix, 0);
- keys[ix] = index;
- values[ix] = ix;
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned NDim,
-          int KernelMaxVolume>
-__global__ void getSubMIndicePairsKernel(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, NDim> kernelSize,
-    const tv::SimpleVector<Index, NDim> stride,
-    const tv::SimpleVector<Index, NDim> padding,
-    const tv::SimpleVector<Index, NDim> dilation,
-    const tv::SimpleVector<Index, NDim> outSpatialShape) {
- auto numActIn = indicesIn.dim(0);
- Index spatialVolume = 1;
-#pragma unroll
- for (int i = 0; i < NDim; ++i) {
- spatialVolume *= outSpatialShape[i];
- }
- Index numValidPoints = 0;
- Index validPoints[KernelMaxVolume * (NDim + 1)];
- Index *pointPtr = nullptr;
- Index index = 0;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
-    numValidPoints = getValidOutPos<Index, NDim>(
- indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
- stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
- validPoints);
- for (int i = 0; i < numValidPoints; ++i) {
- pointPtr = validPoints + i * (NDim + 1);
- auto offset = pointPtr[NDim];
-      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
- pointPtr, outSpatialShape.data(), 0) +
- spatialVolume * indicesIn(ix, 0);
- if (gridsOut[index] > -1) {
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- indicePairs(1, offset, oldNum) = gridsOut[index];
- indicePairs(0, offset, oldNum) = ix;
- }
- }
- }
-}
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
-          unsigned K2>
-__global__ void getSubMIndicePairsUnrollKernel3(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
-    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
-    const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
- auto numActIn = indicesIn.dim(0);
-
- Index point[3];
- Index index = 0;
- Index offset;
- constexpr unsigned KV = K0 * K1 * K2;
- constexpr unsigned center = KV / 2;
- *(indiceNum.data() + center) = numActIn;
-  for (int ix : tv::KernelLoopX<int>(numActIn)) {
- const Index *indice_data = indicesIn.data() + ix * (3 + 1);
-#pragma unroll
- for (int i = 0; i < K0; ++i) {
-#pragma unroll
- for (int j = 0; j < K1; ++j) {
-#pragma unroll
- for (int k = 0; k < K2; ++k) {
- offset = i * K1 * K2 + j * K2 + k;
- if (offset > center) {
- continue;
- }
- if (center == offset) {
-          // center of subm indice pairs doesn't need atomicAdd
- indicePairs(1, offset, ix) = ix;
- indicePairs(0, offset, ix) = ix;
- } else {
- point[2] = indice_data[3] - k + K2 / 2;
- point[1] = indice_data[2] - j + K1 / 2;
- point[0] = indice_data[1] - i + K0 / 2;
- if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
- point[2] >= 0 && point[2] < outSpatialShape[2] &&
- point[0] >= 0 && point[0] < outSpatialShape[0]) {
- index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
- point, outSpatialShape.data(), 0) +
- spatialVolume * indice_data[0];
- if (gridsOut[index] != -1) {
- // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
- // - 1]
- Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
- atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
- indicePairs(1, offset, oldNum) = gridsOut[index];
- indicePairs(0, offset, oldNum) = ix;
- indicePairs(1, KV - offset - 1, oldNum) = ix;
- indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
- }
- }
- }
- }
- }
- }
- }
-}
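-
-// This unrolled subm kernel only visits kernel offsets up to the center and
-// mirrors each hit to offset KV - offset - 1, exploiting the symmetry
-// indicePairs[0, i] == indicePairs[1, KV - i - 1] to halve the work.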
-
-template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
-__global__ void getSubMIndicePairsUnrollKernel2(
-    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
- tv::TensorView