From 01ed382c95704d86bf74f71f7d6eba8fe60399c5 Mon Sep 17 00:00:00 2001 From: "yan.yan" Date: Mon, 18 Oct 2021 11:57:12 +0800 Subject: [PATCH] working on tensor core test --- .github/workflows/build.yaml | 36 - .gitignore | 2 + .gitmodules | 9 - CHANGELOG.md | 6 + CMakeLists.txt | 64 - Dockerfile | 9 - LICENSE | 2 +- PERFORMANCE_GUIDE.md | 31 - README.md | 208 +- codeai-devops.yaml | 116 - docs/API.md | 16 + docs/DEVELOPMENT.md | 16 + docs/PERFORMANCE_GUIDE.md | 16 + example/mnist_sparse.py | 16 +- example/voxel_gen.py | 38 + include/cuhash/cuda_util.h | 51 - include/cuhash/debugging.h | 77 - include/cuhash/definitions.h | 116 - include/cuhash/hash_functions.h | 91 - include/cuhash/hash_table.cuh | 275 - include/cuhash/hash_table.h | 228 - include/paramsgrid.h | 65 - include/spconv/box_iou.h | 156 - include/spconv/cublas_gemm.h | 47 - include/spconv/fused_conv.cu.h | 629 - include/spconv/fused_conv.h | 55 - include/spconv/fused_spconv_ops.h | 126 - include/spconv/geometry.h | 183 - include/spconv/indice.cu.h | 571 - include/spconv/indice.h | 58 - include/spconv/maxpool.h | 44 - include/spconv/minkowski.cu.h | 179 - include/spconv/nms.h | 202 - include/spconv/nms_functor.h | 37 - include/spconv/nms_gpu.h | 18 - include/spconv/nms_ops.h | 74 - include/spconv/pillar_scatter_functor.h | 31 - include/spconv/pillar_scatter_ops.h | 56 - include/spconv/point2voxel.cu.h | 81 - include/spconv/point2voxel.h | 276 - include/spconv/point2voxel_ops.h | 30 - include/spconv/points2voxels.h | 22 - include/spconv/pool_ops.h | 35 - include/spconv/reordering.cu.h | 432 - include/spconv/reordering.h | 47 - include/spconv/spconv_ops.h | 58 - include/spgemm/gemm.h | 81 - include/spgemm/gemm_th.h | 11 - include/sphash/hashmap.h | 11 - include/tensorrt/inference.h | 207 - include/tensorview/cc17.h | 264 - include/tensorview/common.h | 94 - include/tensorview/cuda_utils.h | 31 - include/tensorview/eigen_utils.h | 41 - include/tensorview/kernel_utils.h | 72 - include/tensorview/mp_helper.h | 56 - include/tensorview/prettyprint.h | 475 - include/tensorview/pybind_utils.h | 170 - include/tensorview/tensor.h | 992 -- include/tensorview/tensorview.h | 1503 -- include/tensorview/tools.h | 75 - include/tensorview/torch_utils.h | 147 - include/torch_utils.h | 124 - include/tsl/robin_growth_policy.h | 334 - include/tsl/robin_hash.h | 1360 -- include/tsl/robin_map.h | 715 - include/utility/timer.h | 58 - pyproject.toml | 3 + setup.py | 261 +- spconv/__init__.py | 47 +- spconv/algo.py | 578 + spconv/build.py | 31 + spconv/constants.py | 27 + spconv/core_cc/__init__.pyi | 14 + spconv/core_cc/csrc/__init__.pyi | 14 + spconv/core_cc/csrc/sparse/__init__.pyi | 14 + spconv/core_cc/csrc/sparse/all/__init__.pyi | 110 + spconv/core_cc/csrc/sparse/all/ops1d.pyi | 28 + spconv/core_cc/csrc/sparse/all/ops2d.pyi | 28 + spconv/core_cc/csrc/sparse/all/ops3d.pyi | 28 + spconv/core_cc/csrc/sparse/all/ops4d.pyi | 28 + spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi | 34 + spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi | 34 + spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi | 34 + spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi | 34 + spconv/core_cc/cumm/__init__.pyi | 14 + spconv/core_cc/cumm/gemm/__init__.pyi | 14 + spconv/core_cc/cumm/gemm/gather.pyi | 91 + spconv/core_cc/cumm/gemm/main.pyi | 187 + spconv/csrc/__init__.py | 14 + spconv/csrc/sparse/__init__.py | 14 + spconv/csrc/sparse/all.py | 250 + spconv/csrc/sparse/devleop/sort_bench.py | 37 + spconv/csrc/sparse/devleop/wtf.py | 85 + spconv/csrc/sparse/indices.py | 770 + spconv/csrc/sparse/maxpool.py 
| 174 + spconv/csrc/sparse/pointops.py | 482 + spconv/ops.py | 160 - spconv/pytorch/__init__.py | 31 + spconv/{ => pytorch}/conv.py | 167 +- spconv/{ => pytorch}/core.py | 14 + spconv/pytorch/cppcore.py | 51 + spconv/{ => pytorch}/functional.py | 10 +- spconv/{ => pytorch}/identity.py | 0 spconv/{ => pytorch}/modules.py | 15 +- spconv/pytorch/ops.py | 590 + spconv/{ => pytorch}/pool.py | 23 +- spconv/{ => pytorch}/spatial.py | 12 +- spconv/{ => pytorch}/tables.py | 18 +- spconv/test_utils.py | 8 +- spconv/utils/__init__.py | 387 +- src/cuhash/CMakeLists.txt | 25 - src/cuhash/debugging.cpp | 104 - src/cuhash/debugging.cu | 236 - src/cuhash/hash_functions.cpp | 14 - src/cuhash/hash_functions.cu | 38 - src/cuhash/hash_table.cpp | 232 - src/cuhash/hash_table.cu | 112 - src/cuhash/main.cc | 43 - src/spconv/CMakeLists.txt | 22 - src/spconv/all.cc | 35 - src/spconv/cublas_gemm.cc | 53 - src/spconv/fused_conv.cu | 155 - src/spconv/indice.cc | 330 - src/spconv/indice.cu | 413 - src/spconv/maxpool.cc | 82 - src/spconv/maxpool.cu | 482 - src/spconv/nms.cc | 142 - src/spconv/nms.cu | 73 - src/spconv/pillar_scatter.cu | 66 - src/spconv/point2voxel.cu | 108 - src/spconv/point2voxel_ops.cc | 42 - src/spconv/pool_ops.cc | 73 - src/spconv/reordering.cc | 74 - src/spconv/reordering.cu | 374 - src/spconv/spconv_ops.cc | 752 - src/spgemm/CMakeLists.txt | 10 - src/spgemm/gemm.cu | 34 - src/spgemm/torchdev_cutlass.cu | 38 - src/utils/CMakeLists.txt | 27 - src/utils/all.cc | 103 - src/utils/nms.cu | 148 - test/aaa.py | 82 + test/benchmark.py | 143 +- test/benchmark_detail.py | 199 - test/benchmark_points_to_voxel.py | 110 - test/benchmark_points_to_voxel_gpu.py | 170 - test/src/catch_main.cpp | 15 - test/src/test_conv_rule.cpp | 126 - test/test_conv.py | 78 +- third_party/catch2/catch.hpp | 14020 ----------------- third_party/cutlass | 1 - third_party/mp11 | 1 - third_party/pybind11 | 1 - tools/README.md | 23 + tools/build-wheels.sh | 42 + tools/install_windows_cuda.ps1 | 128 + tools/msvc_setup.ps1 | 17 + version.txt | 1 + 159 files changed, 4868 insertions(+), 31450 deletions(-) delete mode 100644 .github/workflows/build.yaml delete mode 100644 CMakeLists.txt delete mode 100644 Dockerfile delete mode 100644 PERFORMANCE_GUIDE.md delete mode 100644 codeai-devops.yaml create mode 100644 docs/API.md create mode 100644 docs/DEVELOPMENT.md create mode 100644 docs/PERFORMANCE_GUIDE.md create mode 100644 example/voxel_gen.py delete mode 100644 include/cuhash/cuda_util.h delete mode 100644 include/cuhash/debugging.h delete mode 100644 include/cuhash/definitions.h delete mode 100644 include/cuhash/hash_functions.h delete mode 100644 include/cuhash/hash_table.cuh delete mode 100644 include/cuhash/hash_table.h delete mode 100644 include/paramsgrid.h delete mode 100644 include/spconv/box_iou.h delete mode 100644 include/spconv/cublas_gemm.h delete mode 100644 include/spconv/fused_conv.cu.h delete mode 100644 include/spconv/fused_conv.h delete mode 100644 include/spconv/fused_spconv_ops.h delete mode 100644 include/spconv/geometry.h delete mode 100644 include/spconv/indice.cu.h delete mode 100644 include/spconv/indice.h delete mode 100644 include/spconv/maxpool.h delete mode 100644 include/spconv/minkowski.cu.h delete mode 100644 include/spconv/nms.h delete mode 100644 include/spconv/nms_functor.h delete mode 100644 include/spconv/nms_gpu.h delete mode 100644 include/spconv/nms_ops.h delete mode 100644 include/spconv/pillar_scatter_functor.h delete mode 100644 include/spconv/pillar_scatter_ops.h delete mode 100644 
include/spconv/point2voxel.cu.h delete mode 100644 include/spconv/point2voxel.h delete mode 100644 include/spconv/point2voxel_ops.h delete mode 100644 include/spconv/points2voxels.h delete mode 100644 include/spconv/pool_ops.h delete mode 100644 include/spconv/reordering.cu.h delete mode 100644 include/spconv/reordering.h delete mode 100644 include/spconv/spconv_ops.h delete mode 100644 include/spgemm/gemm.h delete mode 100644 include/spgemm/gemm_th.h delete mode 100644 include/sphash/hashmap.h delete mode 100644 include/tensorrt/inference.h delete mode 100644 include/tensorview/cc17.h delete mode 100644 include/tensorview/common.h delete mode 100644 include/tensorview/cuda_utils.h delete mode 100644 include/tensorview/eigen_utils.h delete mode 100644 include/tensorview/kernel_utils.h delete mode 100644 include/tensorview/mp_helper.h delete mode 100644 include/tensorview/prettyprint.h delete mode 100644 include/tensorview/pybind_utils.h delete mode 100644 include/tensorview/tensor.h delete mode 100644 include/tensorview/tensorview.h delete mode 100644 include/tensorview/tools.h delete mode 100644 include/tensorview/torch_utils.h delete mode 100644 include/torch_utils.h delete mode 100644 include/tsl/robin_growth_policy.h delete mode 100644 include/tsl/robin_hash.h delete mode 100644 include/tsl/robin_map.h delete mode 100644 include/utility/timer.h create mode 100644 pyproject.toml create mode 100644 spconv/algo.py create mode 100644 spconv/build.py create mode 100644 spconv/constants.py create mode 100644 spconv/core_cc/__init__.pyi create mode 100644 spconv/core_cc/csrc/__init__.pyi create mode 100644 spconv/core_cc/csrc/sparse/__init__.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/__init__.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops1d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops2d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops3d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops4d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops_cpu1d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops_cpu2d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops_cpu3d.pyi create mode 100644 spconv/core_cc/csrc/sparse/all/ops_cpu4d.pyi create mode 100644 spconv/core_cc/cumm/__init__.pyi create mode 100644 spconv/core_cc/cumm/gemm/__init__.pyi create mode 100644 spconv/core_cc/cumm/gemm/gather.pyi create mode 100644 spconv/core_cc/cumm/gemm/main.pyi create mode 100644 spconv/csrc/__init__.py create mode 100644 spconv/csrc/sparse/__init__.py create mode 100644 spconv/csrc/sparse/all.py create mode 100644 spconv/csrc/sparse/devleop/sort_bench.py create mode 100644 spconv/csrc/sparse/devleop/wtf.py create mode 100644 spconv/csrc/sparse/indices.py create mode 100644 spconv/csrc/sparse/maxpool.py create mode 100644 spconv/csrc/sparse/pointops.py delete mode 100644 spconv/ops.py create mode 100644 spconv/pytorch/__init__.py rename spconv/{ => pytorch}/conv.py (79%) rename spconv/{ => pytorch}/core.py (87%) create mode 100644 spconv/pytorch/cppcore.py rename spconv/{ => pytorch}/functional.py (98%) rename spconv/{ => pytorch}/identity.py (100%) rename spconv/{ => pytorch}/modules.py (97%) create mode 100644 spconv/pytorch/ops.py rename spconv/{ => pytorch}/pool.py (95%) rename spconv/{ => pytorch}/spatial.py (92%) rename spconv/{ => pytorch}/tables.py (67%) delete mode 100644 src/cuhash/CMakeLists.txt delete mode 100644 src/cuhash/debugging.cpp delete mode 100644 src/cuhash/debugging.cu delete mode 100644 src/cuhash/hash_functions.cpp delete 
mode 100644 src/cuhash/hash_functions.cu delete mode 100644 src/cuhash/hash_table.cpp delete mode 100644 src/cuhash/hash_table.cu delete mode 100644 src/cuhash/main.cc delete mode 100644 src/spconv/CMakeLists.txt delete mode 100644 src/spconv/all.cc delete mode 100644 src/spconv/cublas_gemm.cc delete mode 100644 src/spconv/fused_conv.cu delete mode 100644 src/spconv/indice.cc delete mode 100644 src/spconv/indice.cu delete mode 100644 src/spconv/maxpool.cc delete mode 100644 src/spconv/maxpool.cu delete mode 100644 src/spconv/nms.cc delete mode 100644 src/spconv/nms.cu delete mode 100644 src/spconv/pillar_scatter.cu delete mode 100644 src/spconv/point2voxel.cu delete mode 100644 src/spconv/point2voxel_ops.cc delete mode 100644 src/spconv/pool_ops.cc delete mode 100644 src/spconv/reordering.cc delete mode 100644 src/spconv/reordering.cu delete mode 100644 src/spconv/spconv_ops.cc delete mode 100644 src/spgemm/CMakeLists.txt delete mode 100644 src/spgemm/gemm.cu delete mode 100644 src/spgemm/torchdev_cutlass.cu delete mode 100644 src/utils/CMakeLists.txt delete mode 100644 src/utils/all.cc delete mode 100644 src/utils/nms.cu create mode 100644 test/aaa.py delete mode 100644 test/benchmark_detail.py delete mode 100644 test/benchmark_points_to_voxel.py delete mode 100644 test/benchmark_points_to_voxel_gpu.py delete mode 100644 test/src/catch_main.cpp delete mode 100644 test/src/test_conv_rule.cpp delete mode 100644 third_party/catch2/catch.hpp delete mode 160000 third_party/cutlass delete mode 160000 third_party/mp11 delete mode 160000 third_party/pybind11 create mode 100644 tools/README.md create mode 100644 tools/build-wheels.sh create mode 100644 tools/install_windows_cuda.ps1 create mode 100644 tools/msvc_setup.ps1 create mode 100644 version.txt diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml deleted file mode 100644 index c14d1cb..0000000 --- a/.github/workflows/build.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: build - -on: - push: - branches: - - master - schedule: - # * is a special character in YAML so you have to quote this string - - cron: '0 0 * * 6' # base builds run every saturday - -jobs: - build: - runs-on: ubuntu-latest - env: - DOCKER_IMAGE_NAME: scrin/dev-spconv - DOCKER_FILE_PATH: ./Dockerfile - - # TODO: create a action to reuse code. the problem is how to reuse docker-login. - steps: - - uses: actions/checkout@master - - name: Build Docker - run: | - docker build . 
--file ${{env.DOCKER_FILE_PATH}} --tag ${{env.DOCKER_IMAGE_NAME}}:latest
- docker tag ${{env.DOCKER_IMAGE_NAME}}:latest ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
-
- - name: Login to Registry
- uses: azure/docker-login@v1
- with:
- username: ${{ secrets.DOCKER_USERNAME }}
- password: ${{ secrets.DOCKER_PASSWORD }}
-
- - name: Publish to Registry
- run: |
- docker push ${{env.DOCKER_IMAGE_NAME}}:latest
- docker push ${{env.DOCKER_IMAGE_NAME}}:${{ github.sha }}
diff --git a/.gitignore b/.gitignore
index a68262c..657dd08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,3 +107,5 @@ venv.bak/
 .mypy_cache/
 .vscode
+
+__version__.py
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index 63f9714..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +0,0 @@
-[submodule "third_party/pybind11"]
- path = third_party/pybind11
- url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cutlass"]
- path = third_party/cutlass
- url = https://github.com/NVIDIA/cutlass
-[submodule "third_party/mp11"]
- path = third_party/mp11
- url = https://github.com/boostorg/mp11
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff4fa57..8a3cfcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
+## [2.0.0] - 2021-10-16
+### Changed
+- Change the build system from cmake to pccm.
+- Move the pytorch python code to spconv.pytorch.
+- Rewrite all C++ code.
+
 ## [1.2.1] - 2020-06-04
 ### Changed
 - The subm indice pair generation speed is greatly increased by two tricks: 1. most subm conv use only kernelsize=3, so we can unroll loops to get 100% performance increase. 2. subm indice pairs have a property: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1], so we can get another 100% performance increase.
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 6be70cf..0000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
-
-option(SPCONV_BuildTests "Build the unit tests when BUILD_TESTING is enabled." ON)
-option(SPCONV_BuildCUDA "Build cuda code when BUILD_TESTING is enabled."
ON) -if (SPCONV_BuildCUDA) - project(SparseConv LANGUAGES CXX CUDA VERSION 1.1) -else() - project(SparseConv LANGUAGES CXX VERSION 1.1) -endif() - -if(WIN32) # true if windows (32 and 64 bit) - add_compile_definitions(TV_WINDOWS) -endif() -add_compile_definitions(PYTORCH_VERSION=${PYTORCH_VERSION}) - -set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - add_compile_definitions(TV_DEBUG) -endif() -# add_compile_definitions(TV_LOG_KERNEL_INFO) - -find_package(Torch REQUIRED) -# set(CMAKE_VERBOSE_MAKEFILE ON) -if (SPCONV_BuildCUDA) - set(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}") - get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY) - get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT_DIR}" DIRECTORY) - if(WIN32) # true if windows (32 and 64 bit) - set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64") - else() - set(CUDA_LIB_PATH_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64") - endif() - find_library(CUDA_CUDART NAMES cudart HINTS ${CUDA_LIB_PATH_HINTS}) - find_library(CUDA_CUBLAS NAMES cublas HINTS ${CUDA_LIB_PATH_HINTS}) - torch_cuda_get_nvcc_gencode_flag(NVCC_FLAGS_EXTRA) - string (REPLACE ";" " " NVCC_FLAGS_EXTRA_STR "${NVCC_FLAGS_EXTRA}") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA_STR}") - add_compile_definitions(TV_CUDA) -endif() -# add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -add_subdirectory(third_party/pybind11) - -set(ALL_LIBS ${TORCH_LIBRARIES}) - -set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include) -set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include) -set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include) - -if (SPCONV_BuildCUDA) - set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS}) - set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - add_subdirectory(src/cuhash) - add_subdirectory(src/spgemm) -endif() -add_subdirectory(src/spconv) -add_subdirectory(src/utils) - -if (SPCONV_BuildTests) - include(CTest) #adds option BUILD_TESTING (default ON) - if(BUILD_TESTING) - enable_testing() - add_subdirectory(test) - endif() -endif() diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 007ba79..0000000 --- a/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM scrin/dev:latest - -RUN PROBLEM_FILE=/usr/local/lib/python3.8/dist-packages/torch/share/cmake/Caffe2/Caffe2Targets.cmake && \ - sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces;-fopenmp//g' $PROBLEM_FILE && \ - sed -i 's/-Wall;-Wextra;-Wno-unused-parameter;-Wno-missing-field-initializers;-Wno-write-strings;-Wno-unknown-pragmas;-Wno-missing-braces//g' $PROBLEM_FILE && \ - cd /root && \ - git clone --depth 1 --recursive https://www.github.com/traveller59/spconv.git && \ - cd ./spconv && \ - SPCONV_FORCE_BUILD_CUDA=1 python setup.py install diff --git a/LICENSE b/LICENSE index b131473..ba46142 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2019-2020 Yan Yan + Copyright 2019-2021 Yan Yan Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/PERFORMANCE_GUIDE.md b/PERFORMANCE_GUIDE.md deleted file mode 100644 index 54bef8f..0000000 --- a/PERFORMANCE_GUIDE.md +++ /dev/null @@ -1,31 +0,0 @@ -## Performance Guide - -### 1. 
Regular sparse conv is very slow - -Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower! - -This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points. - -**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead. - -### 2. Large Spatial Shape cost too much GPU memory - -Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory. - -To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data. - -Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large. - -### 3. Stacked submanifold convolution can share same indice data - -When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution. - -### 4. Different convolution algorithm may lead to different performance - -There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```. - -* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points. - -* ```Batch```: **cost more GPU memory** should be used when number of active points is small. - -* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution. \ No newline at end of file diff --git a/README.md b/README.md index 1e95261..c4b671c 100644 --- a/README.md +++ b/README.md @@ -1,173 +1,95 @@ + + # SpConv: PyTorch Spatially Sparse Convolution Library [![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild) -This is a spatially sparse convolution library like [SparseConvNet](https://github.com/facebookresearch/SparseConvNet) but faster and easy to read. This library provide sparse convolution/transposed, submanifold convolution, inverse convolution and sparse maxpool. - - -2020-5-2, we add ConcatTable, JoinTable, AddTable, and Identity function to build ResNet and Unet in this version of spconv. - - -## Docker: - -```docker pull scrin/dev-spconv```, contains python 3.8, cuda 10.1, fish shell, newest pytorch and tensorflow. - -## Install on Ubuntu 16.04/18.04 - -* if you are using pytorch 1.4+ and encounter "nvcc fatal: unknown -Wall", you need to go to torch package dir and remove flags contains "-Wall" in INTERFACE_COMPILE_OPTIONS in Caffe2Targets.cmake. This problem can't be fixed in this project (to avoid this, I need to remove all torch dependency in cuda sources and drop half support). - -0. Use ```git clone xxx.git --recursive``` to clone this repo. - -1. Install boost headers to your system include path, you can use either ```sudo apt-get install libboost-all-dev``` or download compressed files from boost official website and copy headers to include path. - -2. Download cmake >= 3.13.2, then add cmake executables to PATH. - -3. 
Ensure you have installed pytorch 1.0+ in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```). - -4. Run ```cd ./dist```, use pip to install generated whl file. - -## Install on Windows 10 with CUDA 10 and python 3.6 (python 3.7 may have problem, see [this](https://github.com/pytorch/pytorch/issues/17233)) - -Since install newest driver and CUDA is very simple on windows, please use CUDA 10 on windows. - -0. Install Visual Studio 2017. Use ```git clone xxx.git --recursive``` to clone this repo. - -1. Download compressed files from boost official website and copy headers (i.e. boost_1_69/boost) to spconv/include. - -2. Download and install cmake >= 3.13.2, select add cmake to User or System PATH. - -3. Ensure you have installed pytorch 1.0 in your environment, run ```python setup.py bdist_wheel``` (don't use ```python setup.py install```). - -4. Run ```cd ./dist```, use pip to install generated whl file. - -## Compare with SparseConvNet - -### Features - -* SparseConvNet's Sparse Convolution don't support padding and dilation, spconv support this. - -* spconv only contains sparse convolutions, the batchnorm and activations can directly use layers from torch.nn, SparseConvNet contains lots of their own implementation of layers such as batchnorm and activations. - -### Speed - -* spconv is faster than SparseConvNet due to gpu indice generation and gather-gemm-scatter algorithm. SparseConvNet use hand-written gemm which is slow. - -## Usage - -### SparseConvTensor - -```Python -features = # your features with shape [N, numPlanes] -indices = # your indices/coordinates with shape [N, ndim + 1], batch index must be put in indices[:, 0] -spatial_shape = # spatial shape of your sparse tensor, spatial_shape[i] is shape of indices[:, 1 + i]. -batch_size = # batch size of your sparse tensor. -x = spconv.SparseConvTensor(features, indices, spatial_shape, batch_size) -x_dense_NCHW = x.dense() # convert sparse tensor to dense NCHW tensor. -print(x.sparity) # helper function to check sparity. -``` - -### Sparse Convolution - -```Python -import spconv -from torch import nn -class ExampleNet(nn.Module): - def __init__(self, shape): - super().__init__() - self.net = spconv.SparseSequential( - spconv.SparseConv3d(32, 64, 3), # just like nn.Conv3d but don't support group and all([d > 1, s > 1]) - nn.BatchNorm1d(64), # non-spatial layers can be used directly in SparseSequential. - nn.ReLU(), - spconv.SubMConv3d(64, 64, 3, indice_key="subm0"), - nn.BatchNorm1d(64), - nn.ReLU(), - # when use submanifold convolutions, their indices can be shared to save indices generation time. - spconv.SubMConv3d(64, 64, 3, indice_key="subm0"), - nn.BatchNorm1d(64), - nn.ReLU(), - spconv.SparseConvTranspose3d(64, 64, 3, 2), - nn.BatchNorm1d(64), - nn.ReLU(), - spconv.ToDense(), # convert spconv tensor to dense and convert it to NCHW format. - nn.Conv3d(64, 64, 3), - nn.BatchNorm1d(64), - nn.ReLU(), - ) - self.shape = shape - - def forward(self, features, coors, batch_size): - coors = coors.int() # unlike torch, this library only accept int coordinates. - x = spconv.SparseConvTensor(features, coors, self.shape, batch_size) - return self.net(x)# .dense() -``` +# WORK IN PROGRESS, DON'T USE!!! -### Inverse Convolution +## Breaking changes in Spconv 2.x -Inverse sparse convolution means "inv" of sparse convolution. the output of inverse convolution contains same indices as input of sparse convolution. 
+* ```spconv.xxx``` moves to ```spconv.pytorch.xxx```; change all ```import spconv``` to ```import spconv.pytorch as spconv``` and ```from spconv.xxx import``` to ```from spconv.pytorch.xxx import``` (see the sketch after this list).
+* ```use_hash``` in Sparse Convolution is removed; 2.x always uses a hash table.
+* weight layout has been changed to RSKC (native algorithm) or KRSC (implicit gemm), and is no longer RSCK (spconv 1.x). RS is the kernel size, C is the input channel, K is the output channel.
+* all util ops are removed (pillar scatter/nms/...).
+* VoxelGenerator has been replaced by Point2VoxelGPU[1-4]d/Point2VoxelCPU[1-4]d (see ```example/voxel_gen.py``` in this patch).
+* spconv 2.x doesn't support CPU for now.
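+A minimal migration sketch covering the import change and the weight-layout change (the exact permute order is an assumption about the KRSC layout, and ```old_weight``` is a hypothetical 1.x checkpoint tensor, so treat this as a starting point rather than a drop-in recipe):
+
+```Python
+import torch
+import spconv.pytorch as spconv  # 1.x code was just `import spconv`
+
+# building a sparse tensor is unchanged except for the import path
+features = torch.rand(100, 32).cuda()  # [N, num_channels]
+indices = torch.randint(0, 40, (100, 4), dtype=torch.int32).cuda()
+indices[:, 0] = 0  # batch index always lives in indices[:, 0]
+x = spconv.SparseConvTensor(features, indices, spatial_shape=[40, 40, 40], batch_size=1)
+
+# convert a saved 1.x weight from RSCK [kd, kh, kw, C, K] to KRSC [K, kd, kh, kw, C]
+old_weight = torch.rand(3, 3, 3, 32, 64)  # placeholder 1.x checkpoint tensor
+new_weight = old_weight.permute(4, 0, 1, 2, 3).contiguous()
+```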
-Inverse convolution usually used in semantic segmentation.

+## News in Spconv 2.0.0

-```Python
-class ExampleNet(nn.Module):
-    def __init__(self, shape):
-        super().__init__()
-        self.net = spconv.SparseSequential(
-            spconv.SparseConv3d(32, 64, 3, 2, indice_key="cp0"),
-            spconv.SparseInverseConv3d(64, 32, 3, indice_key="cp0"), # need provide kernel size to create weight
-        )
-        self.shape = shape

+* training/inference speed is increased
+* int8/tensor core support
+* doesn't depend on pytorch binary.
+* If your GPU has tensor cores, try mixed precision training in spconv 2.x!
+* since spconv 2.x doesn't depend on the pytorch binary (and never will), torch.jit/libtorch inference can't be supported.

-    def forward(self, features, coors, batch_size):
-        coors = coors.int()
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
-        return self.net(x)
-```

+## TODO in Spconv 2.x
+- [ ] Ampere (A100 / RTX 3000 series) feature support (work in progress)
+- [ ] torch QAT support (work in progress)
+- [ ] TensorRT (torch.fx based)
+- [ ] Build C++ only package
+- [ ] JIT compilation for CUDA kernels
+- [ ] Document (low priority)
+- [ ] CPU support (low priority)

-### Utility functions

+## Install

-* convert point cloud to voxel

+You need to install python >= 3.6 first to use spconv 2.x.

-```Python
+You need to install the CUDA toolkit before using prebuilt binaries or building from source.
-voxel_generator = spconv.utils.VoxelGenerator(
-    voxel_size=[0.1, 0.1, 0.1],
-    point_cloud_range=[-50, -50, -3, 50, 50, 1],
-    max_num_points=30,
-    max_voxels=40000
-)
+You need at least CUDA 10.2 to build and run spconv 2.x. We won't offer any support for CUDA < 10.2.
-points = # [N, 3+] tensor.
-voxels, coords, num_points_per_voxel = voxel_generator.generate(points)
-```

+### Prebuilt

-## Implementation Details

+We offer python 3.6-3.10 and cuda 10.2/11.1/11.4 prebuilt binaries for linux (manylinux) and windows 10/11.

-This implementation use gather-gemm-scatter framework to do sparse convolution.

+We will offer prebuilts for the CUDA versions supported by the latest pytorch release. For example, pytorch 1.9 supports cuda 10.2 and 11.1, so we support them too.

-## Projects using spconv:

+For Linux users, you need to install pip >= 20.3 first to install the prebuilt binaries.

-* [second.pytorch](https://github.com/traveller59/second.pytorch): Point Cloud Object Detection in KITTI Dataset.

+```pip install spconv-cu102``` for CUDA 10.2

-## Authors

+```pip install spconv-cu111``` for CUDA 11.1

-* **Yan Yan** - *Initial work* - [traveller59](https://github.com/traveller59)

+```pip install spconv-cu114``` for CUDA 11.4

-* **Bo Li** - *gpu indice generation idea, owner of patent of the sparse conv gpu indice generation algorithm (don't include subm)* - [prclibo](https://github.com/prclibo)

+### Build from source

-## Third party libraries

+You need to rebuild ```cumm``` first if you are building with a CUDA version that isn't provided in the prebuilts.

-* [CUDPP](https://github.com/cudpp/cudpp): A cuda library. contains a cuda hash implementation.

+#### Linux

-* [robin-map](https://github.com/Tessil/robin-map): A fast c++ hash library. almost 2x faster than std::unordered_map in this project.

+1. install build-essential, install CUDA
+2. run ```export SPCONV_DISABLE_JIT="1"```
+3. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dists/xxx.whl```

-* [pybind11](https://github.com/pybind/pybind11): A head-only python c++ binding library.

+#### Windows 10/11

-* [prettyprint](https://github.com/louisdx/cxx-prettyprint): A head-only library for container print.

+1. install visual studio 2019 or newer. make sure the C++ development package is installed. install CUDA
+2. set [powershell script execution policy](https://docs.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_execution_policies?view=powershell-7.1)
+3. start a new powershell, run ```tools/msvc_setup.ps1```
+4. run ```$Env:SPCONV_DISABLE_JIT = "1"```
+5. run ```python setup.py install```/```pip install -e .```/```python setup.py bdist_wheel```+```pip install dists/xxx.whl```

-## License

+## Note

-This project is licensed under the Apache license 2.0 License - see the [LICENSE.md](LICENSE.md) file for details

+This work was done while the author was an employee at TuSimple.

-The [CUDPP](https://github.com/cudpp/cudpp) hash code is licensed under BSD License.

+## LICENSE

-The [robin-map](https://github.com/Tessil/robin-map) code is licensed under MIT license.

+Apache 2.0
\ No newline at end of file
diff --git a/codeai-devops.yaml b/codeai-devops.yaml
deleted file mode 100644
index 47afa8c..0000000
--- a/codeai-devops.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-global:
- console_url: localhost:50091
- envs:
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
-
-analyzers: # only one analyzer is allowed for one type for now.
- PythonAnalyzer:
-
- SimpleCPPAnalyzer: # $ devops.devs = ["_ci_dev_xxx"] is allowed in raw sources.
- includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
-
-observers:
- # run test functions when that function change or marked function change.
- test:
- type: TestObserver
-
- # run dev functions when that function change or marked function change.
- dev: - type: DevObserver - pattern: _ci_dev_.* - - clangdev: - type: CPPDevObserver - main_pattern: dev_.*\.(cc|cpp|cxx) - pattern: .*\.(cc|cpp|cxx|h|hpp|hxx) - compiler: clang++ - executable: build/codeai_dev - includes: [ - include, - /usr/local/cuda/include, - /home/yy/anaconda3/include, - /home/yy/anaconda3/include/python3.7m, - third_party/pybind11/include, - third_party/include, - /home/yy/library/boost_1_72_0, - ] - libpaths: [ - /home/yy/anaconda3/lib, - ] - libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg] - std: c++14 - options: [-Wall, -Wextra] - - cudadev: - type: CPPDevObserver - main_pattern: dev_.*\.cu - pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu) - compiler: nvcc - executable: build/codeai_dev_cuda - run_cmd: [$(executable)] - sources: [] - includes: [ - include, - /usr/local/cuda/include, - /home/yy/anaconda3/include, - /home/yy/anaconda3/include/python3.7m, - third_party/pybind11/include, - third_party/cutlass/include, - ] - libpaths: [ - /usr/local/cuda/lib64, - /home/yy/anaconda3/lib, - ] - libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg] - std: c++14 - options: [ - -Wno-deprecated-declarations, - # "-gencode=arch=compute_52,code=sm_52", - "-gencode=arch=compute_61,code=sm_61", - # "-gencode=arch=compute_60,code=sm_60", - # "-gencode=arch=compute_70,code=sm_70", - # "-gencode=arch=compute_75,code=sm_75", - ] - - torchdev: - type: CPPDevObserver - main_pattern: torchdev_.*\.(cu|cpp|cc|cxx) - pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu) - compiler: nvcc - executable: build/codeai_dev_torch - run_cmd: [$(executable)] - fail_cmds: # run cmd when pervious run fail with retcode - -6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix - includes: [ - include, - /home/yy/anaconda3/lib/python3.7/site-packages/torch/include, - /home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include, - /usr/local/cuda/include, - /home/yy/anaconda3/include, - /home/yy/anaconda3/include/python3.7m, - third_party/pybind11/include, - third_party/cutlass/include, - ] - libpaths: [ - /home/yy/anaconda3/lib/python3.7/site-packages/torch/lib, - /usr/local/cuda/lib64, - /home/yy/anaconda3/lib, - ] - libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread, - "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"", - "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"", - "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"", - "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"", - "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""] - std: c++14 - # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0] - - options: [ - -Wno-deprecated-declarations, - --expt-relaxed-constexpr, - "-gencode=arch=compute_61,code=sm_61", - -D_GLIBCXX_USE_CXX11_ABI=0, - ] - - - diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..820fea0 --- /dev/null +++ b/docs/API.md @@ -0,0 +1,16 @@ + + diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md new file mode 100644 index 0000000..820fea0 --- /dev/null +++ b/docs/DEVELOPMENT.md @@ -0,0 +1,16 @@ + + diff --git a/docs/PERFORMANCE_GUIDE.md b/docs/PERFORMANCE_GUIDE.md new file mode 100644 index 0000000..820fea0 --- /dev/null +++ b/docs/PERFORMANCE_GUIDE.md @@ -0,0 +1,16 @@ + + diff --git a/example/mnist_sparse.py b/example/mnist_sparse.py index 5830c8b..537d26c 100644 --- a/example/mnist_sparse.py +++ b/example/mnist_sparse.py @@ -1,7 +1,21 @@ +# Copyright 2021 Yan Yan +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import argparse
 import torch
-import spconv
+import spconv.pytorch as spconv
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
diff --git a/example/voxel_gen.py b/example/voxel_gen.py
new file mode 100644
index 0000000..2c910f8
--- /dev/null
+++ b/example/voxel_gen.py
@@ -0,0 +1,38 @@
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from cumm import tensorview as tv
+from spconv.utils import Point2VoxelCPU3d
+
+
+def main():
+    gen = Point2VoxelCPU3d(
+        vsize_xyz=[0.1, 0.1, 0.1],
+        coors_range_xyz=[-80, -80, -2, 80, 80, 6],
+        num_point_features=3,
+        max_num_voxels=5000,
+        max_num_points_per_voxel=5)
+
+    pc = np.random.uniform(-10, 10, size=[1000, 3])
+    pc_tv = tv.from_numpy(pc)
+    # generate voxels. note that voxels_tv references a persistent buffer in the generator,
+    # so this can't be run from multiple threads.
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel(pc_tv)
+    # run voxel gen again and FILL the empty point slots of each voxel with the voxel MEAN VALUE
+    voxels_tv, indices_tv, num_p_in_vx_tv = gen.point_to_voxel_empty_mean(pc_tv)
+
+if __name__ == "__main__":
+    main()
diff --git a/include/cuhash/cuda_util.h b/include/cuhash/cuda_util.h
deleted file mode 100644
index c3ee1ca..0000000
--- a/include/cuhash/cuda_util.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _CUDA_UTIL_H_
-#define _CUDA_UTIL_H_
-
-#if CUDART_VERSION >= 4000
-#define CUDA_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
-#else
-#define CUDA_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
-#endif
-
-#define CUDA_SAFE_CALL_NO_SYNC(call) \
- { \
- cudaError err = call; \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
- __LINE__, cudaGetErrorString(err)); \
- exit(EXIT_FAILURE); \
- } \
- }
-
-#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
-
-//!
Check for CUDA error -#ifdef _DEBUG -#define CUDA_CHECK_ERROR(errorMessage) \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - err = CUDA_DEVICE_SYNCHRONIZE(); \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } -#else -#define CUDA_CHECK_ERROR(errorMessage) \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (cudaSuccess != err) { \ - fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ - errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \ - exit(EXIT_FAILURE); \ - } \ - } -#endif - -#endif \ No newline at end of file diff --git a/include/cuhash/debugging.h b/include/cuhash/debugging.h deleted file mode 100644 index 22d4c1c..0000000 --- a/include/cuhash/debugging.h +++ /dev/null @@ -1,77 +0,0 @@ -// ------------------------------------------------------------- -// cuDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision:$ -// $Date:$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt in -// the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file - * debugging.h - * - * @brief Debugging/statistics/performance utilities header for hash tables. - */ - -#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H -#define CUDAHT__CUCKOO__SRC__LIBRARY__DEBUGGING__H - -#include "definitions.h" -#include -#include - -#include - -namespace cuhash { - -//! @name Debugging functions -/// @{ - -void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys, - const unsigned table_size, - const uint2 *constants, - const unsigned kNumHashFunctions); - -//! Output how many probes were required by each thread to perform the -//! retrieval. -/*! @param[in] n_queries Number of queries being performed. - * @param[in] d_retrieval_probes Device array: the number of probes taken for - * each thread's retrieval. - * @param[in] n_functions Number of hash functions used. - */ -void OutputRetrievalStatistics(const unsigned n_queries, - const unsigned *d_retrieval_probes, - const unsigned n_functions); - -//! Outputs information about how many iterations threads required to -//! successfully cuckoo hash. -/*! @param[in] n Number of keys in the input. - * @param[in] d_iterations_taken Device mem: Number of iterations each - * thread took. - * @param[in] d_max_iterations_taken Device mem: Largest number of iterations - * taken by any thread. - */ -void OutputBuildStatistics(const unsigned n, - const unsigned *d_iterations_taken); - -//! Prints out the contents of the stash. -void PrintStashContents(const Entry *d_stash); - -//! Checks if a key is assigned the same slot by different hash functions. 
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys, - const unsigned *d_keys, const unsigned table_size, - uint2 *constants); - -/// @} - -}; // namespace cuhash - -#endif - -// Leave this at the end of the file -// Local Variables: -// mode:c++ -// c-file-style: "NVIDIA" -// End: diff --git a/include/cuhash/definitions.h b/include/cuhash/definitions.h deleted file mode 100644 index 658fb87..0000000 --- a/include/cuhash/definitions.h +++ /dev/null @@ -1,116 +0,0 @@ -// ------------------------------------------------------------- -// cuDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision:$ -// $Date:$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt in -// the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file definitions.h - * - * @brief Stores configuration flags and definitions for hard-coded values in - * hash table implementations. - */ - -#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H -#define CUDAHT__CUCKOO__SRC__LIBRARY__DEFINITIONS__H - -#include -#include -#include - -/* -------------------------------------------------------------------------- - Debugging. - -------------------------------------------------------------------------- */ -#ifdef _DEBUG -//! Forces the hash functions to generate a full set of slots for each key when -//! not using subtables. -// #define FORCEFULLY_GENERATE_NO_CYCLES - -//! Count how many iterations are taken to insert/find items. -#define TRACK_ITERATIONS - -//! Count how many items fail to be inserted when the hash table fails to build. -#define COUNT_UNINSERTED - -//! Take some statistics on the hash functions. -#define TAKE_HASH_FUNCTION_STATISTICS - -#ifdef TAKE_HASH_FUNCTION_STATISTICS -//! Determine how many keys hash into each table slot. -#define COUNT_HOW_MANY_HASH_INTO_EACH_SLOT - -//! Determine how many unique slots a key is assigned. -#define COUNT_HOW_MANY_HAVE_CYCLES -#endif -#endif - -#ifdef USE_DAN_OUTPUT -#include -//! Logs any error messages. -inline void PrintMessage(const char *message, const bool error = false) { - PrintIndentedMessage(message, error); -} -#else -//! Prints a message out to the console. -inline void PrintMessage(const char *message, const bool error = false) { - if (error) { - printf("cudahash: %s\n", message); - } else { - printf("%s\n", message); - } -} -#endif - -/* ------------------------------------------------------------------------- - Hash table constants and definitions. - ------------------------------------------------------------------------- */ -namespace cuhash { - -/** - * \addtogroup cudpp_hash_data_structures - * - * @{ - */ - -typedef unsigned long long - Entry; //!< A key and its value are stored in a 64-bit number. The key is - //!< stored in the upper 32 bits. - -const unsigned kMaxRestartAttempts = 10; //!< Number of build attempts. -const unsigned kKeyEmpty = 0xffffffffu; //!< Signifies empty slots in the table. -const unsigned kNotFound = - 0xffffffffu; //!< Signifies that a query key was not found. -const unsigned kMaxHashFunctions = - 5; //!< Maximum number of hash functions allowed. -const unsigned kStashSize = - 101; //!< How many slots the stash hash table contains. - -//! Value indicating that a hash table slot has no valid item within it. -const Entry kEntryEmpty = Entry(kKeyEmpty) << 32; - -//! 
Value returned when a query fails. -const Entry kEntryNotFound = (Entry(kKeyEmpty) << 32) + kNotFound; - -//! Number of threads to put in a thread block. -const unsigned kBlockSize = 64; - -//! Number of blocks to put along each axis of the grid. -const unsigned kGridSize = 16384; - -//! Minimum table sizes for 2 through 5 functions. -const float kMinimumSpaceUsages[] = {std::numeric_limits::max(), - std::numeric_limits::max(), - 2.01f, - 1.1f, - 1.03f, - 1.02f}; - -/** @} */ // end cudpp_hash_data_structures - -}; // namespace cuhash - -#endif diff --git a/include/cuhash/hash_functions.h b/include/cuhash/hash_functions.h deleted file mode 100644 index 31ce6f7..0000000 --- a/include/cuhash/hash_functions.h +++ /dev/null @@ -1,91 +0,0 @@ -/*! @file hash_functions.h - * @brief Hash function code. - */ - -#ifndef HASH_FUNCTIONS__H -#define HASH_FUNCTIONS__H - -#include "definitions.h" -#include -#include - -namespace cuhash { - -//! Prime number larger than the largest practical hash table size. -const unsigned kPrimeDivisor = 4294967291u; -// https://www.alpertron.com.ar/ECM.HTM -// const unsigned long kPrimeDivisor = 18446744073709551557lu -// const long kPrimeDivisor = 9223372036854775783l -// const Entry kPrimeDivisor = 4300000013lu; -// const unsigned kPrimeDivisor = 334214459; - -//! Generates a set of linear hash function constants. -/*! @param[in] N Number of hash functions. - @param[out] constants CPU pointer to the constants. - @param[in] num_keys Debug only: How many keys are in the input. - @param[in] d_keys Debug only: Device memory array containing the input - keys. - @param[in] table_size Debug only: Size of the hash table. - */ -void GenerateFunctions(const unsigned N, const unsigned num_keys, - const unsigned *d_keys, const unsigned table_size, - uint2 *constants); - -//! Container for all of the hash functions. -template struct Functions { - //! The constants required for all of the hash functions, including the stash. - //! Each function requires 2. - uint2 constants[N]; - - //! Generate new hash function constants. - /*! The parameters are only used for debugging and examining the key - distribution. \param[in] num_keys Debug: Number of keys in the input. - \param[in] d_keys Debug: Device array of the input keys. - \param[in] table_size Debug: Size of the hash table. - */ - void Generate(const unsigned num_keys, const unsigned *d_keys, - const unsigned table_size) { - GenerateFunctions(N, num_keys, d_keys, table_size, constants); - } -}; - -//! Computes the value of a hash function for a given key. -/*! \param[in] constants Constants used by the hash function. - ! \param[in] key Key being hashed. - ! \returns The value of the hash function for the key. - */ -inline __device__ __host__ unsigned hash_function_inner(const uint2 constants, - const unsigned key) { -#if 1 - // Fast version. - return ((constants.x ^ key) + constants.y) % kPrimeDivisor; -#else - // Slow version. - return ((unsigned long long)constants.x * key + constants.y) % kPrimeDivisor; -#endif -} - -//! Computes the value of a hash function for a given key. -/*! \param[in] functions All of the constants used by the hash functions. - ! \param[in] which_function Which hash function is being used. - ! \param[in] key Key being hashed. - ! \returns The value of a hash function with a given key. 
- */ -template -TV_HOST_DEVICE_INLINE unsigned -hash_function(const Functions functions, - const unsigned which_function, const unsigned key) { - return hash_function_inner(functions.constants[which_function], key); -} - -//! Simple hash function used by the stash. -TV_HOST_DEVICE_INLINE -unsigned stash_hash_function(const uint2 stash_constants, const unsigned key) { - return (stash_constants.x ^ key + stash_constants.y) % kStashSize; -} - -unsigned generate_random_uint32(); - -}; // namespace cuhash - -#endif diff --git a/include/cuhash/hash_table.cuh b/include/cuhash/hash_table.cuh deleted file mode 100644 index 29f9b70..0000000 --- a/include/cuhash/hash_table.cuh +++ /dev/null @@ -1,275 +0,0 @@ -// ------------------------------------------------------------- -// cuDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision:$ -// $Date:$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt in -// the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file hash_table.cuh - * - * @brief Implements kernel and __device__ functions for a basic hash table. - */ - -#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH -#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__CUH - -#include "definitions.h" -#include "hash_table.h" -#include -#include - -namespace cuhash { - -//! Makes an 64-bit Entry out of a key-value pair for the hash table. -TV_HOST_DEVICE_INLINE Entry make_entry(unsigned key, unsigned value) { - return (Entry(key) << 32) + value; -} - -//! Returns the key of an Entry. -TV_HOST_DEVICE_INLINE unsigned get_key(Entry entry) { - return (unsigned)(entry >> 32); -} - -//! Returns the value of an Entry. -TV_HOST_DEVICE_INLINE unsigned get_value(Entry entry) { - return (unsigned)(entry & 0xffffffff); -} - -//! @name Internal -//! @brief Functions used for building the hash table. -//! @{ - -//! Fills the entire array with a specific value. -template -__global__ void clear_table(const unsigned table_size, const T value, - T *table) { - unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * blockDim.x * gridDim.x; - if (thread_index < table_size) { - table[thread_index] = value; - } -} - -//! Determine where in the hash table the key could be located. -template -__device__ void KeyLocations(const Functions constants, - const unsigned table_size, const unsigned key, - unsigned locations[kNumHashFunctions]) { -// Compute all possible locations for the key in the big table. -#pragma unroll - for (int i = 0; i < kNumHashFunctions; ++i) { - locations[i] = hash_function(constants, i, key) % table_size; - } -} -//! @} - -/* -------------------------------------------------------------------------- - Retrieval functions. - -------------------------------------------------------------------------- */ -//! Answers a single query. -/*! @ingroup PublicInterface - * @param[in] key Query key - * @param[in] table_size Size of the hash table - * @param[in] table The contents of the hash table - * @param[in] constants The hash functions used to build the table - * @param[in] stash_constants The hash function used to build the stash - * @param[in] stash_count The number of items in the stash - * @param[out] num_probes_required Debug only: The number of probes required - * to resolve the query. - * @returns The value of the query key, if the key exists in the table. 
- * Otherwise, \ref kNotFound will be returned. - */ -template -__device__ unsigned -retrieve(const unsigned query_key, const unsigned table_size, - const Entry *table, const Functions constants, - const uint2 stash_constants, const unsigned stash_count, - unsigned *num_probes_required = NULL) { - // Identify all of the locations that the key can be located in. - unsigned locations[kNumHashFunctions]; - KeyLocations(constants, table_size, query_key, locations); - - // Check each location until the key is found. - unsigned num_probes = 1; - Entry entry = table[locations[0]]; - unsigned key = get_key(entry); - -#pragma unroll - for (unsigned i = 1; i < kNumHashFunctions; ++i) { - if (key != query_key && key != kNotFound) { - num_probes++; - entry = table[locations[i]]; - key = get_key(entry); - } - } - - // Check the stash. - if (stash_count && get_key(entry) != query_key) { - num_probes++; - const Entry *stash = table + table_size; - unsigned slot = stash_hash_function(stash_constants, query_key); - entry = stash[slot]; - } - -#ifdef TRACK_ITERATIONS - if (num_probes_required) { - *num_probes_required = num_probes; - } -#endif - - if (get_key(entry) == query_key) { - return get_value(entry); - } else { - return kNotFound; - } -} - -//! Perform a retrieval from a basic hash table. Each thread manages a single -//! query. -template -__global__ void hash_retrieve(const unsigned n_queries, const unsigned *keys_in, - const unsigned table_size, const Entry *table, - const Functions constants, - const uint2 stash_constants, - const unsigned stash_count, unsigned *values_out, - unsigned *num_probes_required = NULL) { - // Get the key. - unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * blockDim.x * gridDim.x; - if (thread_index >= n_queries) - return; - unsigned key = keys_in[thread_index]; - - values_out[thread_index] = retrieve( - key, table_size, table, constants, stash_constants, stash_count, - (num_probes_required ? num_probes_required + thread_index : NULL)); -} - -/* -------------------------------------------------------------------------- - Build a cuckoo hash table. - -------------------------------------------------------------------------- */ -//! @name Internal -//! @{ - -//! Determine where to insert the key next. The hash functions are used in -//! round-robin order. -template -__device__ unsigned -determine_next_location(const Functions constants, - const unsigned table_size, const unsigned key, - const unsigned previous_location) { - // Identify all possible locations for the entry. - unsigned locations[kNumHashFunctions]; -#pragma unroll - for (unsigned i = 0; i < kNumHashFunctions; ++i) { - locations[i] = hash_function(constants, i, key) % table_size; - } - - // Figure out where the item should be inserted next. - unsigned next_location = locations[0]; -#pragma unroll - for (int i = kNumHashFunctions - 2; i >= 0; --i) { - next_location = - (previous_location == locations[i] ? locations[i + 1] : next_location); - } - return next_location; -} - -//! Attempts to insert a single entry into the hash table. -/*! This process stops after a certain number of iterations. If the thread is - still holding onto an item because of an eviction, it tries the stash. - If it fails to enter the stash, it returns false. - Otherwise, it succeeds and returns true. 
- */ -template -__device__ bool -insert(const unsigned table_size, const Functions constants, - const uint2 stash_constants, const unsigned max_iteration_attempts, - Entry *table, unsigned *stash_count, Entry entry, - unsigned *iterations_used) { - unsigned key = get_key(entry); - - // The key is always inserted into its first slot at the start. - unsigned location = hash_function(constants, 0, key) % table_size; - - // Keep inserting until an empty slot is found or the eviction chain grows too - // large. - for (unsigned its = 1; its <= max_iteration_attempts; its++) { - // Insert the new entry. - entry = atomicExch(&table[location], entry); - key = get_key(entry); - - // If no key was evicted, we're done. - if (key == kKeyEmpty) { - *iterations_used = its; - break; - } - - // Otherwise, determine where the evicted key will go. - location = determine_next_location(constants, table_size, key, location); - } - - if (key != kKeyEmpty) { - // Shove it into the stash. - unsigned slot = stash_hash_function(stash_constants, key); - Entry *stash = table + table_size; - Entry replaced_entry = atomicCAS(stash + slot, kEntryEmpty, entry); - if (replaced_entry != kEntryEmpty) { - return false; - } else { - atomicAdd(stash_count, 1); - } - } - - return true; -} - -// Build a basic hash table, using one big table. -template -__global__ void CuckooHash(const unsigned n_entries, const unsigned *keys, - const unsigned *values, const unsigned table_size, - const Functions constants, - const unsigned max_iteration_attempts, Entry *table, - uint2 stash_constants, unsigned *stash_count, - unsigned *failures, - unsigned *iterations_taken = nullptr) { - // Check if this thread has an item and if any previous threads failed. - unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * blockDim.x * gridDim.x; - if (thread_index >= n_entries || *failures) - return; - Entry entry = make_entry(keys[thread_index], values[thread_index]); - - unsigned iterations = 0; - bool success = insert( - table_size, constants, stash_constants, max_iteration_attempts, table, - stash_count, entry, &iterations); - - if (success == false) { - // The eviction chain grew too large. Report failure. -#ifdef COUNT_UNINSERTED - atomicAdd(failures, 1); -#else - *failures = 1; -#endif - } - -#ifdef TRACK_ITERATIONS - iterations_taken[thread_index] = iterations; -#endif -} -//! @} - -}; // namespace cuhash - -#endif - -// Leave this at the end of the file -// Local Variables: -// mode:c++ -// c-file-style: "NVIDIA" -// End: diff --git a/include/cuhash/hash_table.h b/include/cuhash/hash_table.h deleted file mode 100644 index 055e08c..0000000 --- a/include/cuhash/hash_table.h +++ /dev/null @@ -1,228 +0,0 @@ -// ------------------------------------------------------------- -// cuDPP -- CUDA Data Parallel Primitives library -// ------------------------------------------------------------- -// $Revision:$ -// $Date:$ -// ------------------------------------------------------------- -// This source code is distributed under the terms of license.txt in -// the root directory of this source distribution. -// ------------------------------------------------------------- - -/** - * @file hash_table.h - * - * @brief Header for a basic hash table that stores one value per key. 
- */ - -#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H -#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H - -#include "definitions.h" -#include "hash_functions.h" - -#include - -/** \addtogroup cudpp_app - * @{ - */ - -/** \addtogroup cudpp_hash_data_structures - * @{ - */ - -/* -------------------------------------------------------------------------- - Doxygen definitions. - -------------------------------------------------------------------------- */ -/*! @namespace CudaHT - * @brief Encapsulates the hash table library. - */ - -/*! @namespace CuckooHashing - * @brief Encapsulates the cuckoo hash table that uses stashes. - */ - -/* ------------------------------------------------------------------------- - Hash table code. - ------------------------------------------------------------------------- */ -namespace cuhash { - -//! Compute how many thread blocks are required for the given number of threads. -dim3 ComputeGridDim(unsigned threads); - -//! Compute how long an eviction chain is allowed to become for a given input -//! size. -/*! \param[in] num_keys Number of keys in the input. - * \param[in] table_size Number of slots in the hash table. - * \param[in] num_functions Number of hash functions being used. - * \returns The number of iterations that should be allowed. - * - * The latter two parameters are only needed when using an empirical - * formula for computing the chain length. - */ -unsigned ComputeMaxIterations(const unsigned num_keys, - const unsigned table_size, - const unsigned num_functions); - -//! Basic hash table that stores one value for each key. -/*! The input consists of two unsigned arrays of keys and values. - * None of the keys are expected to be repeated. - * - * @todo Templatize the interface without forcing the header file to - * have CUDA calls. - * @ingroup cudpp_app - */ -class HashTable { -public: - HashTable(); - - virtual ~HashTable() { Release(); } - - //! Initialize the hash table's memory. Must be called before \ref - //! Build() and after the random number generator has been seeded. - /*! @param[in] max_input_size Largest expected number of items in the input. - * @param[in] space_usage Size of the hash table relative to the - * input. Bigger tables are faster to build - * and retrieve from. - * @param[in] num_functions Number of hash functions to use. May be - * 2-5. More hash functions make it easier - * to build the table, but increase - * retrieval times. - * @returns Whether the hash table was initialized successfully (true) - * or not (false). - * - * The minimum space usage is dependent on the number of functions - * being used; for two through five functions, the minimum space - * usage is 2.1, 1.1, 1.03, and 1.02 respectively. - */ - virtual bool Initialize(const unsigned max_input_size, - const float space_usage = 1.25, - const unsigned num_functions = 4); - - //! Free all memory. - virtual void Release(); - - //! Build the hash table. - /*! @param[in] input_size Number of key-value pairs being inserted. - * @param[in] d_keys Device memory array containing all of the input - * keys. - * @param[in] d_vals Device memory array containing the keys' values. - * @returns Whether the hash table was built successfully (true) or - * not (false). - * - * Several attempts are allowed to build the hash table in case of failure. - * The input keys are expected to be completely unique. - * To reduce the chance of a failure, increase the space usage or number of - * functions. - * Keys are not allowed to be equal to cuhash::kKeyEmpty. 
- */ - virtual bool Build(const unsigned input_size, const unsigned *d_keys, - const unsigned *d_vals); - - //! Query the hash table. - /*! @param[in] n_queries Number of keys in the query set. - * @param[in] d_query_keys Device memory array containing all of - * the query keys. - * @param[in] d_query_results Values for the query keys. - * - * kNotFound is returned for any query key that failed to be found - * in the table. - */ - virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys, - unsigned *d_query_results); - - //! @name Accessors - /// @brief Mainly needed to use the __device__ CudaHT::retrieve() - /// function directly. - /// @{ - - //! Returns how many slots the hash table has. - inline unsigned get_table_size() const { return table_size_; } - - //! Returns how many items are stored in the stash. - inline unsigned get_stash_count() const { return stash_count_; } - - //! Returns the constants used by the stash. - inline uint2 get_stash_constants() const { return stash_constants_; } - - //! Returns the hash table contents. - inline const Entry *get_contents() const { return d_contents_; } - - //! Returns the number of hash functions being used. - inline unsigned get_num_hash_functions() const { return num_hash_functions_; } - - //! When using two hash functions, returns the constants. - inline Functions<2> get_constants_2() const { return constants_2_; } - - //! When using three hash functions, returns the constants. - inline Functions<3> get_constants_3() const { return constants_3_; } - - //! When using four hash functions, returns the constants. - inline Functions<4> get_constants_4() const { return constants_4_; } - - //! When using five hash functions, returns the constants. - inline Functions<5> get_constants_5() const { return constants_5_; } - - /// @} - inline Entry *data() { return d_contents_; } - inline const Entry *data() const { return d_contents_; } - -protected: - unsigned table_size_; //!< Size of the hash table. - unsigned num_hash_functions_; //!< Number of hash functions being used. - Entry *d_contents_; //!< Device memory: The hash table contents. The stash is - //!< stored at the end. - unsigned stash_count_; //!< Number of key-value pairs currently stored. - uint2 stash_constants_; //!< Hash function constants for the stash. - - Functions<2> constants_2_; //!< Constants for a set of two hash functions. - Functions<3> constants_3_; //!< Constants for a set of three hash functions. - Functions<4> constants_4_; //!< Constants for a set of four hash functions. - Functions<5> constants_5_; //!< Constants for a set of five hash functions. - - unsigned *d_failures_; //!< Device memory: General use error flag. -}; - -/*! @name Internal - * @{ - */ -namespace CUDAWrapper { -//! Fills a 64-bit array with a particular value. -void ClearTable(const unsigned slots_in_table, const Entry fill_value, - Entry *d_array); - -//! Calls the Cuckoo Hash construction kernel. -void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions, - const unsigned *d_keys, const unsigned *d_values, - const unsigned table_size, const Functions<2> constants_2, - const Functions<3> constants_3, - const Functions<4> constants_4, - const Functions<5> constants_5, - const unsigned max_iteration_attempts, Entry *d_contents, - uint2 stash_constants, unsigned *d_stash_count, - unsigned *d_failures, unsigned *d_iterations_taken); - -//! Calls the kernel that performs retrievals. 
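Taken together, the Initialize/Build/Retrieve interface above is used roughly as follows. A hedged host-side sketch, assuming `n` unique keys already resident in device memory (error handling elided; `build_and_query` is a hypothetical helper, not part of this header):

#include <cstdlib>
#include "cuhash/hash_table.h"

bool build_and_query(unsigned n, const unsigned *d_keys, const unsigned *d_vals,
                     unsigned n_queries, const unsigned *d_query_keys,
                     unsigned *d_results) {
  std::srand(42);                  // Initialize() expects a seeded RNG
  cuhash::HashTable table;
  if (!table.Initialize(n, /*space_usage=*/1.25f, /*num_functions=*/4))
    return false;                  // table memory could not be allocated
  if (!table.Build(n, d_keys, d_vals))
    return false;                  // keys must be unique and != kKeyEmpty
  // Any absent query key yields cuhash::kNotFound in d_results.
  table.Retrieve(n_queries, d_query_keys, d_results);
  return true;                     // destructor calls Release()
}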
-void CallHashRetrieve(const unsigned n_queries, - const unsigned num_hash_functions, - const unsigned *keys_in, const unsigned table_size, - const Entry *table, const Functions<2> constants_2, - const Functions<3> constants_3, - const Functions<4> constants_4, - const Functions<5> constants_5, - const uint2 stash_constants, const unsigned stash_count, - unsigned *values_out); -}; // namespace CUDAWrapper -/// @} - -}; // namespace cuhash - -/** @} */ // end hash table data structures -/** @} */ // end cudpp_app - -#endif - -// Leave this at the end of the file -// Local Variables: -// mode:c++ -// c-file-style: "NVIDIA" -// End: diff --git a/include/paramsgrid.h b/include/paramsgrid.h deleted file mode 100644 index c978dfe..0000000 --- a/include/paramsgrid.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file is used for c++ unit test, but pytorch jit ops don't support c++ -// debug build. - -#ifndef PARAMS_GRID_H_ -#define PARAMS_GRID_H_ -#include -#include - -namespace detail { -template int getTotalSize(std::vector arg) { return arg.size(); } - -template -int getTotalSize(std::vector arg, std::vector... args) { - return arg.size() * getTotalSize(args...); -} -template int getSize(std::vector arg) { return arg.size(); } - -template -void assigner(TT &src, std::vector counter, std::vector &arg) { - std::get(src) = arg[counter[Idx]]; -} - -template -void assigner(TT &src, std::vector counter, std::vector &arg, - std::vector &... args) { - std::get(src) = arg[counter[Idx]]; - assigner(src, counter, args...); -} -} // namespace detail -template -std::vector> paramsGrid(std::vector... args) { - int length = detail::getTotalSize(args...); - std::vector sizes = {detail::getSize(args)...}; - int size = sizes.size(); - - std::vector> params(length); - std::vector counter(size); - for (int i = 0; i < length; ++i) { - detail::assigner<0>(params[i], counter, args...); - counter[size - 1] += 1; - for (int c = size - 1; c >= 0; --c) { - if (counter[c] == sizes[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return params; -} - -#endif \ No newline at end of file diff --git a/include/spconv/box_iou.h b/include/spconv/box_iou.h deleted file mode 100644 index 15ceee8..0000000 --- a/include/spconv/box_iou.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
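paramsGrid above enumerates the Cartesian product of its argument vectors, which is how the C++ unit tests sweep configurations. An illustrative use, assuming the templated signature paramsGrid(std::vector<Args>...) -> std::vector<std::tuple<Args...>> suggested by the implementation:

#include <iostream>
#include <tuple>
#include <vector>
#include "paramsgrid.h"

int main() {
  // 2 * 3 * 2 = 12 combinations, enumerated like nested loops.
  auto params = paramsGrid(std::vector<int>{1, 2},           // batch size
                           std::vector<int>{16, 32, 64},     // channels
                           std::vector<float>{0.05f, 0.1f}); // voxel size
  for (auto &p : params) {
    std::cout << std::get<0>(p) << ' ' << std::get<1>(p) << ' '
              << std::get<2>(p) << '\n';
  }
  return 0;
}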
- -#ifndef BOX_IOU_H -#define BOX_IOU_H - -#include -// must include pybind11/eigen.h if using eigen matrix as arguments. -#include -#include -#include - -namespace spconv { -// #include "voxelnet/core/cc/pybind11_helper.h" -namespace py = pybind11; -using namespace pybind11::literals; -template -inline py::array_t constant(ShapeContainer shape, DType value) { - // create ROWMAJOR array. - py::array_t array(shape); - std::fill(array.mutable_data(), array.mutable_data() + array.size(), value); - return array; -} - -template -inline py::array_t zeros(std::vector shape) { - return constant>(shape, 0); -} - -template -py::array_t -rbbox_iou(py::array_t box_corners, py::array_t qbox_corners, - py::array_t standup_iou, DType standup_thresh) { - namespace bg = boost::geometry; - typedef bg::model::point point_t; - typedef bg::model::polygon polygon_t; - polygon_t poly, qpoly; - std::vector poly_inter, poly_union; - DType inter_area, union_area; - auto box_corners_r = box_corners.template unchecked<3>(); - auto qbox_corners_r = qbox_corners.template unchecked<3>(); - auto standup_iou_r = standup_iou.template unchecked<2>(); - auto N = box_corners_r.shape(0); - auto K = qbox_corners_r.shape(0); - py::array_t overlaps = zeros({int(N), int(K)}); - auto overlaps_rw = overlaps.template mutable_unchecked<2>(); - if (N == 0 || K == 0) { - return overlaps; - } - for (int k = 0; k < K; ++k) { - for (int n = 0; n < N; ++n) { - if (standup_iou_r(n, k) <= standup_thresh) - continue; - bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); - bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); - bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); - bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); - bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); - - bg::intersection(poly, qpoly, poly_inter); - - if (!poly_inter.empty()) { - inter_area = bg::area(poly_inter.front()); - bg::union_(poly, qpoly, poly_union); - if (!poly_union.empty()) { - union_area = bg::area(poly_union.front()); - overlaps_rw(n, k) = inter_area / union_area; - } - poly_union.clear(); - } - poly.clear(); - qpoly.clear(); - poly_inter.clear(); - } - } - return overlaps; -} - -template -py::array_t rbbox_intersection(py::array_t box_corners, - py::array_t qbox_corners, - py::array_t standup_iou, - DType standup_thresh) { - namespace bg = boost::geometry; - typedef bg::model::point point_t; - typedef bg::model::polygon polygon_t; - polygon_t poly, qpoly; - std::vector poly_inter, poly_union; - DType inter_area, union_area; - auto box_corners_r = box_corners.template unchecked<3>(); - auto qbox_corners_r = qbox_corners.template unchecked<3>(); - auto standup_iou_r = standup_iou.template unchecked<2>(); - auto N = box_corners_r.shape(0); - auto K = qbox_corners_r.shape(0); - py::array_t overlaps = zeros({int(N), int(K)}); - auto overlaps_rw = overlaps.template mutable_unchecked<2>(); - if (N == 0 || K == 0) { - return overlaps; - } - for (int k = 0; k < K; ++k) { - for (int n = 0; n < N; ++n) { - if (standup_iou_r(n, k) <= 
standup_thresh) - continue; - bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); - bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1))); - bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1))); - bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1))); - bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1))); - bg::append(qpoly, - point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1))); - - bg::intersection(poly, qpoly, poly_inter); - - if (!poly_inter.empty()) { - inter_area = bg::area(poly_inter.front()); - overlaps_rw(n, k) = inter_area; - } - poly.clear(); - qpoly.clear(); - poly_inter.clear(); - } - } - return overlaps; -} - -} // namespace spconv -#endif \ No newline at end of file diff --git a/include/spconv/cublas_gemm.h b/include/spconv/cublas_gemm.h deleted file mode 100644 index 117a127..0000000 --- a/include/spconv/cublas_gemm.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include -#include - -namespace spconv { - -template -cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc); - -template -cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc) { - return cublasTgemm(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda, - beta, C, ldc); -} - -template inline T constant_scalar(float data) { return T(data); } - -template -cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb, - const tv::TensorView A, const tv::TensorView B, - tv::TensorView C) { - TV_ASSERT_RT_ERR(A.ndim() == 2, "error"); - TV_ASSERT_RT_ERR(B.ndim() == 2, "error"); - auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N; - auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N; - int m = transa ? A.dim(1) : A.dim(0); - int n = transb ? B.dim(0) : B.dim(1); - int ka = transa ? A.dim(0) : A.dim(1); - int kb = transb ? B.dim(1) : B.dim(0); - int lda = transa ? m : ka; - int ldb = transb ? ka : n; - int ldc = n; - TV_ASSERT_RT_ERR(ka == kb, "error"); - T alpha = constant_scalar(1); - T beta = constant_scalar(0); - return cublasTgemmRow(handle, transa_cublas, transb_cublas, m, n, ka, - &alpha, A.data(), lda, B.data(), ldb, &beta, - C.data(), ldc); -} - -} // namespace spconv diff --git a/include/spconv/fused_conv.cu.h b/include/spconv/fused_conv.cu.h deleted file mode 100644 index 15533d1..0000000 --- a/include/spconv/fused_conv.cu.h +++ /dev/null @@ -1,629 +0,0 @@ - -/* -BSD License - -For SparseConvNet software - -Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name Facebook nor the names of its contributors may be used to - endorse or promote products derived from this software without specific - prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#define TACC double - -template -__global__ void -dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride) { - // nHot must be a multiple of K!! - - // Input x Weight -> Output - // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks - // K is a multiple of V, - - // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M - - int32_t M = input_nPlanes / K; - // N = gridDim.y == output_nPlanes/K - int32_t n = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += g * input_nPlanes; - outFeatures += n * K + g * output_nPlanes; - w += n * K + g * input_nPlanes * output_nPlanes; - - TACC O[V]; - __shared__ T W[K][K]; - __shared__ T I[K][K]; - int32_t R0[V]; - int32_t R1[V]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - - for (int32_t m = 0; m < M; m++) { -// Read w -#pragma unroll - for (int32_t v = 0; v < V; v++) - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -#pragma unroll - for (int32_t v = 0; v < V; v++) { - R0[v] = rulesIn[s + ty[v]]; - R1[v] = rulesOut[s + ty[v]]; - } - __syncthreads(); - -// Read input, reset O[] -#pragma unroll - for (int32_t v = 0; v < V; v++) { - I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; - O[v] = 0; - } - __syncthreads(); - -#pragma unroll - for (int32_t k = 0; k < K; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) - O[v] += I[ty[v]][k] * W[k][tx]; - -#pragma unroll - for (int32_t v = 0; v < V; v++) - O[v] += outFeatures[R1[v] * output_stride + tx]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - outFeatures[R1[v] * output_stride + tx] = O[v]; - __syncthreads(); - } - w += K * output_nPlanes; - inFeatures += K; - } -} -template -__global__ void -dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride) { - // Input x Weight -> Output - // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks - // K is a multiple of V, - - // nHot x KM -> nHot x KN - parallel over N,nHot - loop over M - - int32_t M = 
input_nPlanes / K; - // N = gridDim.y == output_nPlanes/K - int32_t n = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += g * input_nPlanes; - outFeatures += n * K + g * output_nPlanes; - w += n * K + g * input_nPlanes * output_nPlanes; - - TACC O[V]; - __shared__ T W[K][K]; - __shared__ T I[K][K]; - int32_t R0[V]; - int32_t R1[V]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - - for (int32_t m = 0; m < M; m++) { -// Read w -#pragma unroll - for (int32_t v = 0; v < V; v++) - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (s + ty[v] < nHot) { - R0[v] = rulesIn[s + ty[v]]; - R1[v] = rulesOut[s + ty[v]]; - } - } - __syncthreads(); - -// Read input, reset O[] -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (s + ty[v] < nHot) - I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; - O[v] = 0; - } - __syncthreads(); - -#pragma unroll - for (int32_t k = 0; k < K; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) - O[v] += I[ty[v]][k] * W[k][tx]; - -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (s + ty[v] < nHot) - O[v] += outFeatures[R1[v] * output_stride + tx]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (s + ty[v] < nHot) - outFeatures[R1[v] * output_stride + tx] = O[v]; - __syncthreads(); - } - w += K * output_nPlanes; - inFeatures += K; - } -} - -#define FOO(T, K, V) \ - { \ - if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ - int32_t o = (nHot / K) * K; \ - if (o >= K) \ - dConvolution_KMxKN_forwardA \ - <<>>( \ - inFeatures, outFeatures, w, rulesIn, rulesOut, o, \ - input_nPlanes, input_stride, output_nPlanes, output_stride); \ - if (nHot > o) \ - dConvolution_KMxKN_forwardB \ - <<>>( \ - inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \ - nHot - o, input_nPlanes, input_stride, output_nPlanes, \ - output_stride); \ - return; \ - } \ - } -template -void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride, - int32_t nGroups) { - FOO(T, 64, 16) - FOO(T, 32, 8) - FOO(T, 16, 4) - FOO(T, 8, 2) - assert(false); -} -template <> -void dConvolution_forward(cudaStream_t s, double *inFeatures, - double *outFeatures, double *w, - int32_t *rulesIn, int32_t *rulesOut, - int32_t nHot, int32_t input_nPlanes, - int32_t input_stride, int32_t output_nPlanes, - int32_t output_stride, int32_t nGroups) { - FOO(double, 32, 8) - FOO(double, 16, 4) - FOO(double, 8, 2) - assert(false); -} -#undef FOO -// dOutput x W^T -> dInput and -// Input^T x dOutput -> dW -// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups) -template -__global__ void dConvolution_KMxKN_backward_dW_A( - T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes, - int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) { - // M = gridDim.y == input_nPlanes / K - int32_t N = output_nPlanes / K; - int32_t m = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += m * K + g * input_nPlanes; - dInFeatures += m * K + g * input_nPlanes; - dOutFeatures += g * output_nPlanes; - w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - TACC dI[V]; - TACC 
dW[V]; - __shared__ T I[K][K]; - __shared__ T dO[K][K]; - __shared__ T W[K][K]; - int32_t R0[V]; - int32_t R1[V]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - for (int32_t n = 0; n < N; n++) { -// Read w, reset dW -#pragma unroll - for (int32_t v = 0; v < V; v++) { - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - dW[v] = 0; - } - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -#pragma unroll - for (int32_t v = 0; v < V; v++) { - R0[v] = rulesIn[s + ty[v]]; - R1[v] = rulesOut[s + ty[v]]; - dI[v] = 0; - } - __syncthreads(); -// Read input and dOutput -#pragma unroll - for (int32_t v = 0; v < V; v++) { - I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; - dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; - } - __syncthreads(); -#pragma unroll - for (int32_t k = 0; k < K; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) { - dI[v] += dO[ty[v]][k] * W[tx][k]; - dW[v] += I[k][ty[v]] * dO[k][tx]; - } -#pragma unroll - for (int32_t v = 0; v < V; v++) - dI[v] += dInFeatures[R0[v] * input_stride + tx]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - dInFeatures[R0[v] * input_stride + tx] = dI[v]; - __syncthreads(); - } -#pragma unroll - for (int32_t v = 0; v < V; v++) - atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); - w += K; - dw += K; - dOutFeatures += K; - } -} -// dOutput x W^T -> dInput and -// Input^T x dOutput -> dW -// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups) -template -__global__ void dConvolution_KMxKN_backward_dW_B( - T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes, - int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) { - // M = gridDim.y == input_nPlanes / K - int32_t N = output_nPlanes / K; - int32_t m = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += m * K + g * input_nPlanes; - dInFeatures += m * K + g * input_nPlanes; - dOutFeatures += g * output_nPlanes; - w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - TACC dI[V]; - TACC dW[V]; - __shared__ T I[K][K]; - __shared__ T dO[K][K]; - __shared__ T W[K][K]; - int32_t R0[V]; - int32_t R1[V]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - for (int32_t n = 0; n < N; n++) { -// Read w, reset dW -#pragma unroll - for (int32_t v = 0; v < V; v++) { - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - dW[v] = 0; - } - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (s + ty[v] < nHot) { - R0[v] = rulesIn[s + ty[v]]; - R1[v] = rulesOut[s + ty[v]]; - } - dI[v] = 0; - } - __syncthreads(); -// Read input and dOutput -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (s + ty[v] < nHot) { - I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx]; - dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx]; - } else { - I[ty[v]][tx] = 0; - dO[ty[v]][tx] = 0; - } - __syncthreads(); -#pragma unroll - for (int32_t k = 0; k < K; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) { - dI[v] += dO[ty[v]][k] * W[tx][k]; - dW[v] += I[k][ty[v]] * dO[k][tx]; - } -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (s + ty[v] < nHot) - dI[v] += dInFeatures[R0[v] * input_stride + tx]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (s + ty[v] < nHot) 
- dInFeatures[R0[v] * input_stride + tx] = dI[v]; - __syncthreads(); - } -#pragma unroll - for (int32_t v = 0; v < V; v++) - atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); - w += K; - dw += K; - dOutFeatures += K; - } -} -#define FOO(T, K, V) \ - { \ - if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \ - int32_t o = (nHot / K) * K; \ - if (o >= K) \ - dConvolution_KMxKN_backward_dW_A \ - <<>>(inFeatures, dInFeatures, dOutFeatures, \ - w, dw, rulesIn, rulesOut, o, \ - input_nPlanes, input_stride, \ - output_nPlanes, output_stride); \ - if (nHot > o) \ - dConvolution_KMxKN_backward_dW_B \ - <<>>( \ - inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \ - rulesOut + o, nHot - o, input_nPlanes, input_stride, \ - output_nPlanes, output_stride); \ - return; \ - } \ - } -template -void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures, - T *dOutFeatures, T *w, T *dw, int32_t *rulesIn, - int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride, - int32_t nGroups) { - FOO(T, 32, 8) - FOO(T, 16, 4) - FOO(T, 8, 2) - assert(false); -} -#undef FOO -template -__global__ void -dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride) { - // Input x Weight -> Output - // blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks - // K is a multiple of V, - // nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN - // - parallel over N,nHot - loop over M - int32_t M = (input_nPlanes + K - 1) / K; - // N = gridDim.y ~ output_nPlanes/K - int32_t n = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += g * input_nPlanes; - outFeatures += n * K + g * output_nPlanes; - w += n * K + g * input_nPlanes * output_nPlanes; - int32_t KO = min(K, output_nPlanes - K * n); - TACC O[V]; - __shared__ T W[K][K]; - __shared__ T I[K][K]; - __shared__ int32_t R[K * 2]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - for (int32_t m = 0; m < M; m++) { - int32_t KI = min(K, input_nPlanes - K * m); -// Read w -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (ty[v] < KI and tx < KO) - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -// Read rules for K input/output pairs -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (ty[v] < 1) { - if (s + tx < nHot) { - R[2 * tx] = rulesIn[s + tx]; - R[2 * tx + 1] = rulesOut[s + tx]; - } - // R[q] = rules[2 * s + q]; - } - } - __syncthreads(); -// Read input, reset O[] -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (tx < KI and s + ty[v] < nHot) - I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx]; - O[v] = 0; - } - __syncthreads(); -#pragma unroll - for (int32_t k = 0; k < KI; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) - O[v] += I[ty[v]][k] * W[k][tx]; - __syncthreads(); -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (tx < KO and s + ty[v] < nHot) - outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v]; - __syncthreads(); - } - w += K * output_nPlanes; - inFeatures += K; - } -} -// dOutput x W^T -> dInput and -// Input^T x dOutput -> dW -// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups) -template -__global__ void dConvolution_KMxKN_backward_dW2( - T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, - 
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes, - int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) { - // M = gridDim.y == input_nPlanes / K - int32_t N = (output_nPlanes + K - 1) / K; - int32_t m = blockIdx.y; - int32_t g = blockIdx.z; - inFeatures += m * K + g * input_nPlanes; - dInFeatures += m * K + g * input_nPlanes; - dOutFeatures += g * output_nPlanes; - w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes; - int32_t KI = min(K, input_nPlanes - K * m); - TACC dI[V]; - TACC dW[V]; - __shared__ T I[K][K]; - __shared__ T dO[K][K]; - __shared__ T W[K][K]; - __shared__ int32_t R[K * 2]; - const int32_t tx = threadIdx.x; - int32_t ty[V]; -#pragma unroll - for (int32_t v = 0; v < V; v++) - ty[v] = threadIdx.y + v * (K / V); - for (int32_t n = 0; n < N; n++) { - int32_t KO = min(K, output_nPlanes - K * n); -// Read w, reset dW -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (ty[v] < KI and tx < KO) - W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx]; - dW[v] = 0; - } - for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) { -// Read rules for K input/output pairs, reset dI[] -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (ty[v] < 1) { - if (s + tx < nHot) { - R[2 * tx] = rulesIn[s + tx]; - R[2 * tx + 1] = rulesOut[s + tx]; - } - // R[q] = rules[2 * s + q]; - } - dI[v] = 0; - } - __syncthreads(); -// Read input and dOutput -#pragma unroll - for (int32_t v = 0; v < V; v++) { - if (tx < KI and s + ty[v] < nHot) - I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx]; - else - I[ty[v]][tx] = 0; - if (tx < KO and s + ty[v] < nHot) - dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx]; - else - dO[ty[v]][tx] = 0; - } - __syncthreads(); -#pragma unroll - for (int32_t k = 0; k < KO; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) - dI[v] += dO[ty[v]][k] * W[tx][k]; -#pragma unroll - for (int32_t k = 0; k < K; k++) -#pragma unroll - for (int32_t v = 0; v < V; v++) - dW[v] += I[k][ty[v]] * dO[k][tx]; - __syncthreads(); -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (tx < KI and s + ty[v] < nHot) - dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v]; - __syncthreads(); - } -#pragma unroll - for (int32_t v = 0; v < V; v++) - if (ty[v] < KI and tx < KO) - atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]); - w += K; - dw += K; - dOutFeatures += K; - } -} -template -void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w, - int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride, - int32_t nGroups) { - int32_t c = input_nPlanes * output_nPlanes * nGroups; - if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { - const int32_t K = 16; - const int32_t V = 4; - dConvolution_KMxKN_forward2 - <<>>(inFeatures, outFeatures, w, rulesIn, rulesOut, nHot, - input_nPlanes, input_stride, output_nPlanes, output_stride); - - } else { - dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot, - input_nPlanes, input_stride, output_nPlanes, - output_stride, nGroups); - } -} -template -void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures, - T *dOutFeatures, T *w, T *dw, int32_t *rulesIn, - int32_t *rulesOut, int32_t nHot, - int32_t input_nPlanes, int32_t input_stride, - int32_t output_nPlanes, int32_t output_stride, - int32_t nGroups) { - int32_t c = input_nPlanes * 
output_nPlanes * nGroups; - if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) { - const int32_t K = 16; - const int32_t V = 4; - dConvolution_KMxKN_backward_dW2 - <<>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut, - nHot, input_nPlanes, input_stride, output_nPlanes, - output_stride); - } else { - dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw, - rulesIn, rulesOut, nHot, input_nPlanes, - input_stride, output_nPlanes, output_stride, - nGroups); - } -} -#undef TACC \ No newline at end of file diff --git a/include/spconv/fused_conv.h b/include/spconv/fused_conv.h deleted file mode 100644 index a02d569..0000000 --- a/include/spconv/fused_conv.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include -#include -#include -namespace spconv { - -enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine }; -using all_fused_conv_algos_t = - tv::mp_list_c; - -void fused_conv_cuda(torch::Tensor output, torch::Tensor features, - torch::Tensor filters, torch::Tensor indicesIn, - torch::Tensor indicesOut, int nHot); - -void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din, - torch::Tensor dout, torch::Tensor filters, - torch::Tensor dfilters, torch::Tensor indicesIn, - torch::Tensor indicesOut, int nHot); - -void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features, - torch::Tensor filters, torch::Tensor indicesIn, - torch::Tensor indicesOut, int nHot); -void fused_conv_backward_cuda_minkowski(torch::Tensor features, - torch::Tensor din, torch::Tensor dout, - torch::Tensor filters, - torch::Tensor dfilters, - torch::Tensor indicesIn, - torch::Tensor indicesOut, int nHot); - -template struct FusedConvDispatch; - -template <> struct FusedConvDispatch { - constexpr static auto *fwd = fused_conv_cuda; - constexpr static auto *bwd = fused_conv_backward_cuda; -}; - -template <> struct FusedConvDispatch { - constexpr static auto *fwd = fused_conv_cuda_minkowski; - constexpr static auto *bwd = fused_conv_backward_cuda_minkowski; -}; - -} // namespace spconv diff --git a/include/spconv/fused_spconv_ops.h b/include/spconv/fused_spconv_ops.h deleted file mode 100644 index cb87e67..0000000 --- a/include/spconv/fused_spconv_ops.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
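FusedConvDispatch above is a small compile-time dispatch table: a class template specialized per enum value, each specialization exposing the matching forward/backward function pointers. A self-contained sketch of the same pattern, with toy functions standing in for the real CUDA entry points:

#include <iostream>

enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };

void fwd_scn() { std::cout << "SparseConvNet-style fused conv\n"; }
void fwd_me()  { std::cout << "MinkowskiEngine-style fused conv\n"; }

template <FusedConvAlgo A> struct Dispatch;
template <> struct Dispatch<kFSparseConvNet> {
  constexpr static auto *fwd = fwd_scn;
};
template <> struct Dispatch<kFMinkowskiEngine> {
  constexpr static auto *fwd = fwd_me;
};

int main() {
  Dispatch<kFSparseConvNet>::fwd();   // algorithm resolved at compile time
  Dispatch<kFMinkowskiEngine>::fwd(); // no runtime branching or virtual call
  return 0;
}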
- -#ifndef FUSED_SPARSE_CONV_OP_H_ -#define FUSED_SPARSE_CONV_OP_H_ - -#include -#include -#include -#include -#include - -namespace spconv { -// torch.jit's doc says only support int64, so we need to convert to int32. - -torch::Tensor -fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters, - torch::Tensor bias, torch::Tensor indicePairs, - torch::Tensor indiceNum, int64_t numActOut, - int64_t _inverse, int64_t _subM) { - bool subM = _subM != 0; - bool inverse = _inverse != 0; - auto device = features.device().type(); - auto ndim = filters.dim() - 2; - auto kernelVolume = indicePairs.size(0); - auto numInPlanes = features.size(1); - auto numOutPlanes = filters.size(ndim + 1); - auto indicePairNumCpu = indiceNum.to({torch::kCPU}); - auto indicePairMaxSizeIter = - std::max_element(indicePairNumCpu.data_ptr(), - indicePairNumCpu.data_ptr() + kernelVolume); - int indicePairMaxOffset = - indicePairMaxSizeIter - indicePairNumCpu.data_ptr(); - int indicePairMaxSize = *indicePairMaxSizeIter; - - /*if (_subM){ - std::vector indicePairNumVec(indicePairNumCpu.data_ptr(), - indicePairNumCpu.data_ptr() + kernelVolume); - indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset); - - auto indicePairVecMaxSizeIter = std::max_element( - indicePairNumVec.begin(), indicePairNumVec.end()); - indicePairMaxSize = *indicePairVecMaxSizeIter; - }*/ - - auto options = - torch::TensorOptions().dtype(features.dtype()).device(features.device()); - // auto indicePairOptions = - // torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device()); - - torch::Tensor output = - torch::zeros({numActOut, numOutPlanes}, options).copy_(bias); - torch::Tensor inputBuffer = - torch::zeros({indicePairMaxSize, numInPlanes}, options); - torch::Tensor outputBuffer = - torch::zeros({indicePairMaxSize, numOutPlanes}, options); - filters = filters.view({-1, numInPlanes, numOutPlanes}); - if (subM) { // the center index of subm conv don't need gather and scatter - // add. 
- torch::mm_out(output, features, filters[indicePairMaxOffset]); - } - double totalGatherTime = 0; - double totalGEMMTime = 0; - double totalSAddTime = 0; - for (int i = 0; i < kernelVolume; ++i) { - auto nHot = indicePairNumCpu.data_ptr()[i]; - if (nHot <= 0 || (subM && i == indicePairMaxOffset)) { - continue; - } - // auto timer = spconv::CudaContextTimer<>(); - auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(), - {nHot, numOutPlanes}, options); - auto inputBufferBlob = - torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options); - - if (device == torch::kCPU) { - sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot); - } -#ifdef TV_CUDA - else if (device == torch::kCUDA) { - sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot); - } -#endif - else { - TV_ASSERT_INVALID_ARG(false, "unknown device type"); - } - - // totalGatherTime += timer.report() / 1000.0; - torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]); - // totalGEMMTime += timer.report() / 1000.0; - - if (device == torch::kCPU) { - sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse], - nHot); - } -#ifdef TV_CUDA - else if (device == torch::kCUDA) { - sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse], - nHot); - } -#endif - else { - TV_ASSERT_INVALID_ARG(false, "unknown device type"); - } - - // totalSAddTime += timer.report() / 1000.0; - } - // std::cout << "gather time " << totalGatherTime << std::endl; - // std::cout << "gemm time " << totalGEMMTime << std::endl; - // std::cout << "scatteradd time " << totalSAddTime << std::endl; - return output; -} -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/geometry.h b/include/spconv/geometry.h deleted file mode 100644 index d6bf3de..0000000 --- a/include/spconv/geometry.h +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
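The per-offset loop above implements the classic gather -> GEMM -> scatter-add formulation of sparse convolution. A minimal CPU sketch of one such step, with assumed row-major shapes (features [N, Cin], filter [Cin, Cout], output [M, Cout]); the library delegates the middle step to torch::mm_out instead of the naive triple loop shown here:

#include <vector>

void gather_gemm_scatter(const std::vector<float> &features, int Cin,
                         const std::vector<float> &filter, int Cout,
                         const std::vector<int> &inIdx,   // indicePairs[i][0]
                         const std::vector<int> &outIdx,  // indicePairs[i][1]
                         int nHot, std::vector<float> &output) {
  std::vector<float> inBuf(nHot * Cin), outBuf(nHot * Cout, 0.f);
  // Gather the nHot active input rows into a dense buffer.
  for (int r = 0; r < nHot; ++r)
    for (int c = 0; c < Cin; ++c)
      inBuf[r * Cin + c] = features[inIdx[r] * Cin + c];
  // GEMM: outBuf = inBuf * filter.
  for (int r = 0; r < nHot; ++r)
    for (int k = 0; k < Cin; ++k)
      for (int c = 0; c < Cout; ++c)
        outBuf[r * Cout + c] += inBuf[r * Cin + k] * filter[k * Cout + c];
  // Scatter-add the dense result back to the active output rows.
  for (int r = 0; r < nHot; ++r)
    for (int c = 0; c < Cout; ++c)
      output[outIdx[r] * Cout + c] += outBuf[r * Cout + c];
}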
- -#ifndef SPCONV_GEOMETRY_H_ -#define SPCONV_GEOMETRY_H_ - -#include -#include -#include -#include -#include -namespace spconv { - -namespace detail { - -template struct ToUnsigned; - -template <> struct ToUnsigned { using type = uint32_t; }; - -template <> struct ToUnsigned { using type = uint64_t; }; - -template struct FNVInternal; -template <> struct FNVInternal { - constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5; - constexpr static uint32_t prime = 0x01000193; -}; - -template <> struct FNVInternal { - constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325; - constexpr static uint64_t prime = 0x100000001b3; -}; - -} // namespace detail -template -using to_unsigned_t = typename detail::ToUnsigned>::type; - -template struct FNV1a : detail::FNVInternal { - std::size_t operator()(const T *data, std::size_t size) { - to_unsigned_t hash = detail::FNVInternal::defaultOffsetBasis; - for (std::size_t i = 0; i < size; ++i) { - hash *= detail::FNVInternal::prime; - hash ^= static_cast>(data[i]); - } - return hash; - } -}; - -template -TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, - const Index *kernelSize, - const Index *stride, const Index *padding, - const Index *dilation, - const Index *outSpatialShape, Index *out) { - Index lowers[NDim]; - Index uppers[NDim]; - Index counter[NDim]; - Index counterSize[NDim]; - Index pointCounter = 0; - Index val; - Index numPoints = 1; - Index m, offset; - bool valid = false; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + - stride[i] + padding[i]) / - stride[i]; - uppers[i] = (input_pos[i] + padding[i]) / stride[i]; - } - -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); - numPoints *= counterSize[i]; - } - -#pragma unroll - for (int i = 0; i < NDim; ++i) { - counter[i] = 0; - } - for (int i = 0; i < numPoints; ++i) { - valid = true; - m = 1; - offset = 0; -#pragma unroll - for (int j = NDim - 1; j >= 0; --j) { - val = uppers[j] - counter[j] * dilation[j]; - out[pointCounter * (NDim + 1) + j] = val; - if (val < 0 || (val > outSpatialShape[j] - 1)) { - valid = false; - // break; - } - offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; - m *= kernelSize[j]; - } - - out[pointCounter * (NDim + 1) + NDim] = offset; - if (valid) - ++pointCounter; - counter[NDim - 1] += 1; -#pragma unroll - for (int c = NDim - 1; c >= 0; --c) { - if (counter[c] == counterSize[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return pointCounter; -} - -template -TV_HOST_DEVICE Index getValidOutPosTranspose( - const Index *input_pos, const Index *kernelSize, const Index *stride, - const Index *padding, const Index *dilation, const Index *outSpatialShape, - Index *out) { - Index lowers[NDim]; - Index uppers[NDim]; - Index counter[NDim]; - Index counterSize[NDim]; - Index pointCounter = 0; - Index val; - Index numPoints = 1; - Index m, offset; - bool valid = false; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - lowers[i] = input_pos[i] * stride[i] - padding[i]; - uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; - } -#pragma unroll - for (unsigned i = 0; i < NDim; ++i) { - counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); - numPoints *= counterSize[i]; - } -#pragma unroll - for (int i = 0; i < NDim; ++i) { - counter[i] = 0; - } - for (int i = 0; i < numPoints; ++i) { - valid = true; - m = 1; - offset = 0; -#pragma unroll - for (int j = NDim 
- 1; j >= 0; --j) { - val = uppers[j] - counter[j] * dilation[j]; - out[pointCounter * (NDim + 1) + j] = val; - if (val < 0 || (val > outSpatialShape[j] - 1)) { - valid = false; - // break; - } - offset += m * (val - lowers[j]) / dilation[j]; - m *= kernelSize[j]; - } - out[pointCounter * (NDim + 1) + NDim] = offset; - if (valid) - ++pointCounter; - counter[NDim - 1] += 1; -#pragma unroll - for (int c = NDim - 1; c >= 0; --c) { - if (counter[c] == counterSize[c] && c > 0) { - counter[c - 1] += 1; - counter[c] = 0; - } - } - } - return pointCounter; -} - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/indice.cu.h b/include/spconv/indice.cu.h deleted file mode 100644 index f21cc47..0000000 --- a/include/spconv/indice.cu.h +++ /dev/null @@ -1,571 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef INDICE_CU_H_ -#define INDICE_CU_H_ -#include -#include -#include -#include - -namespace spconv { - -template -struct ConvIndiceDispatch; - -template -struct ConvIndiceDispatch { - constexpr static auto *func = getValidOutPosTranspose; -}; -template -struct ConvIndiceDispatch { - constexpr static auto *func = getValidOutPos; -}; - -template -__global__ void prepareIndicePairsKernel( - tv::TensorView indicesIn, tv::TensorView indicePairs, - tv::TensorView indiceNum, tv::TensorView indicePairUnique, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index kernelVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - kernelVolume *= kernelSize[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - auto indicePairsDim2 = indicePairs.dim(2); - Index index; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = ConvIndiceDispatch::func( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (Index i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(0, offset, oldNum) = ix; - index = tv::ArrayIndexRowMajor::runPtrs( - pointPtr, outSpatialShape.data(), 0) + - spatialVolume * indicesIn(ix, 0); - indicePairs(1, offset, oldNum) = index; - indicePairUnique[offset * indicePairsDim2 + oldNum] = index; - } - } -} - -template -__global__ void assignGridAndIndiceOutKernel( - tv::TensorView indicesOut, tv::TensorView gridsOut, - int numAct, tv::TensorView indicePairs, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape, int batchSize) { - - Index index; - auto indicesOutPtr = indicesOut.data(); - for 
(int ix : tv::KernelLoopX(numAct)) { - index = indicePairUnique[ix]; - gridsOut[index] = ix; - index = tv::rowArrayIdxInv( - index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); - indicesOut[ix * (NDim + 1)] = index % batchSize; - } -} - -template -__global__ void -assignIndiceOutKernel(tv::TensorView indicesOut, int numAct, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape, - int batchSize) { - - Index index; - auto indicesOutPtr = indicesOut.data(); - for (unsigned ix : tv::KernelLoopX(numAct)) { - index = indicePairUnique[ix]; - index = tv::rowArrayIdxInv( - index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); - indicesOut[ix * (NDim + 1)] = index % batchSize; - } -} - -template -__global__ void -assignIndicePairsHashKernel(tv::TensorView indicesOut, int numActIn, - tv::TensorView indicePairs, - tv::TensorView indicePairUnique, - unsigned table_size, const cuhash::Entry *table, - cuhash::Functions constants, - uint2 stash_constants, unsigned stash_count) { - - Index index; - int kernelVolume = indicePairs.dim(1); - auto indicePairsOut = indicePairs.subview(1); - for (int ix : tv::KernelLoopX(numActIn)) { - for (int i = 0; i < kernelVolume; ++i) { - index = indicePairsOut(i, ix); - if (index > -1) { - auto val = cuhash::retrieve((unsigned)(index), table_size, table, - constants, stash_constants, stash_count); - assert(val != cuhash::kNotFound); - indicePairsOut(i, ix) = (unsigned)val; - } - } - } -} - -template -__global__ void -assignIndicePairsKernel(tv::TensorView indicesOut, - tv::TensorView gridsOut, int numActIn, - tv::TensorView indicePairs, - tv::TensorView indicePairUnique, - const tv::SimpleVector outSpatialShape) { - - Index index; - int kernelVolume = indicePairs.dim(1); - auto indicePairsOut = indicePairs.subview(1); - - for (int ix : tv::KernelLoopX(numActIn)) { - for (int i = 0; i < kernelVolume; ++i) { - index = indicePairsOut(i, ix); - if (index > -1) { - indicePairsOut(i, ix) = gridsOut[index]; - } - } - } -} - -template -__global__ void -assignIndicePairsLimitedKernel(tv::TensorView gridsOut, int numActIn, - tv::TensorView indicePairs, - tv::TensorView indiceNum) { - - Index index, val; - int kernelVolume = indicePairs.dim(0); - for (int ix : tv::KernelLoopX(numActIn)) { - for (int i = 0; i < kernelVolume; ++i) { - index = indicePairs(i, 1, ix); - if (index != -1) { - val = gridsOut[index]; - if (val != -1) { - auto oldNum = atomicAdd(indiceNum.data() + i, Index(1)); - indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix); - indicePairs(i, 1, oldNum) = val; - } - } - } - } -} - -template -__global__ void prepareSubMGridKernel( - tv::TensorView indicesIn, tv::TensorView gridsOut, - const tv::SimpleVector outSpatialShape, Index spatialVolume) { - auto numActIn = indicesIn.dim(0); - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - index = - tv::ArrayIndexRowMajor::runPtrs( - indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) + - spatialVolume * indicesIn(ix, 0); - gridsOut[index] = ix; - } -} - -template -__global__ void -prepareSubMHashKernel(tv::TensorView indicesIn, unsigned *keys, - unsigned *values, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, - outSpatialShape.data()) + - spatialVolume * indicesIn(ix, 
0); - keys[ix] = index; - values[ix] = ix; - } -} - -template -__global__ void getSubMIndicePairsKernel( - tv::TensorView indicesIn, tv::TensorView gridsOut, - tv::TensorView indicePairs, tv::TensorView indiceNum, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = getValidOutPos( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (int i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - index = tv::ArrayIndexRowMajor::runPtrs( - pointPtr, outSpatialShape.data(), 0) + - spatialVolume * indicesIn(ix, 0); - if (gridsOut[index] > -1) { - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(1, offset, oldNum) = gridsOut[index]; - indicePairs(0, offset, oldNum) = ix; - } - } - } -} - -template -__global__ void getSubMIndicePairsUnrollKernel3( - tv::TensorView indicesIn, tv::TensorView gridsOut, - tv::TensorView indicePairs, tv::TensorView indiceNum, - const tv::SimpleVector outSpatialShape, Index spatialVolume) { - auto numActIn = indicesIn.dim(0); - - Index point[3]; - Index index = 0; - Index offset; - constexpr unsigned KV = K0 * K1 * K2; - constexpr unsigned center = KV / 2; - *(indiceNum.data() + center) = numActIn; - for (int ix : tv::KernelLoopX(numActIn)) { - const Index *indice_data = indicesIn.data() + ix * (3 + 1); -#pragma unroll - for (int i = 0; i < K0; ++i) { -#pragma unroll - for (int j = 0; j < K1; ++j) { -#pragma unroll - for (int k = 0; k < K2; ++k) { - offset = i * K1 * K2 + j * K2 + k; - if (offset > center) { - continue; - } - if (center == offset) { - // the center of subm indice pairs doesn't need atomicAdd - indicePairs(1, offset, ix) = ix; - indicePairs(0, offset, ix) = ix; - } else { - point[2] = indice_data[3] - k + K2 / 2; - point[1] = indice_data[2] - j + K1 / 2; - point[0] = indice_data[1] - i + K0 / 2; - if (point[1] >= 0 && point[1] < outSpatialShape[1] && - point[2] >= 0 && point[2] < outSpatialShape[2] && - point[0] >= 0 && point[0] < outSpatialShape[0]) { - index = tv::ArrayIndexRowMajor<3, 3>::runPtrs( - point, outSpatialShape.data(), 0) + - spatialVolume * indice_data[0]; - if (gridsOut[index] != -1) { - // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1] - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - atomicAdd(indiceNum.data() + KV - offset - 1, Index(1)); - indicePairs(1, offset, oldNum) = gridsOut[index]; - indicePairs(0, offset, oldNum) = ix; - indicePairs(1, KV - offset - 1, oldNum) = ix; - indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index]; - } - } - } - } - } - } - } -} - -template -__global__ void getSubMIndicePairsUnrollKernel2( - tv::TensorView indicesIn, tv::TensorView gridsOut, - tv::TensorView indicePairs, tv::TensorView indiceNum, - const tv::SimpleVector outSpatialShape, Index spatialVolume) { - auto numActIn = indicesIn.dim(0); - Index point[2]; - Index index = 0; - Index offset; - constexpr unsigned KV = K0 * K1; - constexpr unsigned center = 
KV / 2; - *(indiceNum.data() + center) = numActIn; - - for (int ix : tv::KernelLoopX(numActIn)) { - const Index *indice_data = indicesIn.data() + ix * (2 + 1); -#pragma unroll - for (int i = 0; i < K0; ++i) { -#pragma unroll - for (int j = 0; j < K1; ++j) { - offset = i * K1 + j; - if (offset > center) { - continue; - } - if (center == offset) { - // the center of subm indice pairs doesn't need atomicAdd - indicePairs(1, offset, ix) = ix; - indicePairs(0, offset, ix) = ix; - } else { - point[1] = indice_data[2] - j + K1 / 2; - point[0] = indice_data[1] - i + K0 / 2; - if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 && - point[0] < outSpatialShape[0]) { - index = tv::ArrayIndexRowMajor<2, 2>::runPtrs( - point, outSpatialShape.data(), 0) + - spatialVolume * indice_data[0]; - if (gridsOut[index] > -1) { - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - atomicAdd(indiceNum.data() + KV - offset - 1, Index(1)); - indicePairs(1, offset, oldNum) = gridsOut[index]; - indicePairs(0, offset, oldNum) = ix; - indicePairs(1, KV - offset - 1, oldNum) = ix; - indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index]; - } - } - } - } - } - } -} - -template -__global__ void getSubMIndicePairsHashKernel( - tv::TensorView indicesIn, tv::TensorView indicePairs, - tv::TensorView indiceNum, - const tv::SimpleVector kernelSize, - const tv::SimpleVector stride, - const tv::SimpleVector padding, - const tv::SimpleVector dilation, - const tv::SimpleVector outSpatialShape, unsigned table_size, - const cuhash::Entry *table, cuhash::Functions constants, - uint2 stash_constants, unsigned stash_count) { - auto numActIn = indicesIn.dim(0); - Index spatialVolume = 1; -#pragma unroll - for (int i = 0; i < NDim; ++i) { - spatialVolume *= outSpatialShape[i]; - } - Index numValidPoints = 0; - Index validPoints[KernelMaxVolume * (NDim + 1)]; - Index *pointPtr = nullptr; - Index index = 0; - for (int ix : tv::KernelLoopX(numActIn)) { - numValidPoints = getValidOutPos( - indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), - stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), - validPoints); - for (int i = 0; i < numValidPoints; ++i) { - pointPtr = validPoints + i * (NDim + 1); - auto offset = pointPtr[NDim]; - index = tv::ArrayIndexRowMajor::runPtrs( - pointPtr, outSpatialShape.data(), 0) + - spatialVolume * indicesIn(ix, 0); - auto val = cuhash::retrieve((unsigned)(index), table_size, table, - constants, stash_constants, stash_count); - if (val != cuhash::kNotFound) { - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - indicePairs(1, offset, oldNum) = val; - indicePairs(0, offset, oldNum) = ix; - } - } - } -} - -template -__global__ void getSubMIndicePairsHashUnrollKernel3( - tv::TensorView indicesIn, tv::TensorView indicePairs, - tv::TensorView indiceNum, - const tv::SimpleVector outSpatialShape, Index spatialVolume, - unsigned table_size, const cuhash::Entry *table, - cuhash::Functions constants, uint2 stash_constants, - unsigned stash_count) { - auto numActIn = indicesIn.dim(0); - Index index = 0; - Index offset; - Index point[3]; - constexpr unsigned KV = K0 * K1 * K2; - constexpr unsigned center = KV / 2; - *(indiceNum.data() + center) = numActIn; - for (int ix : tv::KernelLoopX(numActIn)) { - const Index *indice_data = indicesIn.data() + ix * (3 + 1); -#pragma unroll - for (int i = 0; i < K0; ++i) { -#pragma unroll - for (int j = 0; j < K1; ++j) { -#pragma unroll - for (int k = 0; k < K2; ++k) { - offset = i * K1 * K2 + j * K2 + k; - if (offset > 
center) { - continue; - } - if (center == offset) { - // the center of subm indice pairs doesn't need atomicAdd - indicePairs(1, offset, ix) = ix; - indicePairs(0, offset, ix) = ix; - } else { - point[2] = indice_data[3] - k + K2 / 2; - point[1] = indice_data[2] - j + K1 / 2; - point[0] = indice_data[1] - i + K0 / 2; - if (point[1] >= 0 && point[1] < outSpatialShape[1] && - point[2] >= 0 && point[2] < outSpatialShape[2] && - point[0] >= 0 && point[0] < outSpatialShape[0]) { - index = tv::ArrayIndexRowMajor<3, 3>::runPtrs( - point, outSpatialShape.data(), 0) + - spatialVolume * indice_data[0]; - auto val = - cuhash::retrieve((unsigned)(index), table_size, table, - constants, stash_constants, stash_count); - - if (val != cuhash::kNotFound) { - // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1] - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - atomicAdd(indiceNum.data() + KV - offset - 1, Index(1)); - indicePairs(1, offset, oldNum) = val; - indicePairs(0, offset, oldNum) = ix; - indicePairs(1, KV - offset - 1, oldNum) = ix; - indicePairs(0, KV - offset - 1, oldNum) = val; - } - } - } - } - } - } - } -} - -template -__global__ void getSubMIndicePairsHashUnrollKernel2( - tv::TensorView indicesIn, tv::TensorView indicePairs, - tv::TensorView indiceNum, - const tv::SimpleVector outSpatialShape, Index spatialVolume, - unsigned table_size, const cuhash::Entry *table, - cuhash::Functions constants, uint2 stash_constants, - unsigned stash_count) { - auto numActIn = indicesIn.dim(0); - Index index = 0; - Index offset; - Index point[2]; - constexpr unsigned KV = K0 * K1; - constexpr unsigned center = KV / 2; - *(indiceNum.data() + center) = numActIn; - for (int ix : tv::KernelLoopX(numActIn)) { - const Index *indice_data = indicesIn.data() + ix * (2 + 1); -#pragma unroll - for (int i = 0; i < K0; ++i) { -#pragma unroll - for (int j = 0; j < K1; ++j) { - offset = i * K1 + j; - if (offset > center) { - continue; - } - if (center == offset) { - // the center of subm indice pairs doesn't need atomicAdd - indicePairs(1, offset, ix) = ix; - indicePairs(0, offset, ix) = ix; - } else { - point[1] = indice_data[2] - j + K1 / 2; - point[0] = indice_data[1] - i + K0 / 2; - if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 && - point[0] < outSpatialShape[0]) { - index = tv::ArrayIndexRowMajor<2, 2>::runPtrs( - point, outSpatialShape.data(), 0) + - spatialVolume * indice_data[0]; - auto val = - cuhash::retrieve((unsigned)(index), table_size, table, - constants, stash_constants, stash_count); - - if (val != cuhash::kNotFound) { - // for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1] - Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); - atomicAdd(indiceNum.data() + KV - offset - 1, Index(1)); - indicePairs(1, offset, oldNum) = val; - indicePairs(0, offset, oldNum) = ix; - indicePairs(1, KV - offset - 1, oldNum) = ix; - indicePairs(0, KV - offset - 1, oldNum) = val; - } - } - } - } - } - } -} - -template -__global__ void resetGridKernel(const Index *indicePairUnique, - tv::TensorView gridsOut, - int numAct) { - for (int ix : tv::KernelLoopX(numAct)) { - gridsOut[indicePairUnique[ix]] = -1; - } -} - -template __global__ void arangeKernel(T *data, int size) { - for (int ix : tv::KernelLoopX(size)) { - data[ix] = ix; - } -} - -template -__global__ void -resetGridSubMKernel(const Index *indices, tv::TensorView gridsOut, - const tv::SimpleVector outSpatialShape, - int numAct, Index spatialVolume) { - auto indsPtr = indices; - Index index; - for 
(int ix : tv::KernelLoopX(numAct)) { - indsPtr = indices + ix * (NDim + 1); - index = tv::ArrayIndexRowMajor::runPtrs( - indsPtr + 1, outSpatialShape.data(), 0); - gridsOut[index + spatialVolume * indsPtr[0]] = -1; - } -} - -} // namespace spconv - -#undef atomicAdd - -#endif \ No newline at end of file diff --git a/include/spconv/indice.h b/include/spconv/indice.h deleted file mode 100644 index 81830c0..0000000 --- a/include/spconv/indice.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ -#define SPARSE_CONV_INDICE_FUNCTOR_H_ -#include -#include - -namespace spconv { -int create_conv_indice_pair_p1_cuda( - torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum, - torch::Tensor indicePairUnique, std::vector kernelSize, - std::vector stride, std::vector padding, - std::vector dilation, std::vector outSpatialShape, - bool transpose); - -int create_conv_indice_pair_p2_cuda( - torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut, - torch::Tensor indicePairs, torch::Tensor indiceNum, - torch::Tensor indicePairUnique, std::vector outSpatialShape, - bool transpose, bool resetGrid, bool useHash); - -int create_submconv_indice_pair_cuda( - torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs, - torch::Tensor indiceNum, std::vector kernelSize, - std::vector stride, std::vector padding, - std::vector dilation, std::vector outSpatialShape, - bool transpose, bool resetGrid, bool useHash); - -int create_conv_indice_pair_cpu( - torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut, - torch::Tensor indicePairs, torch::Tensor indiceNum, - std::vector kernelSize, std::vector stride, - std::vector padding, std::vector dilation, - std::vector outSpatialShape, bool transpose, bool resetGrid, - bool useHash); - -int create_submconv_indice_pair_cpu( - torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs, - torch::Tensor indiceNum, std::vector kernelSize, - std::vector stride, std::vector padding, - std::vector dilation, std::vector outSpatialShape, - bool transpose, bool resetGrid, bool useHash); - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/maxpool.h b/include/spconv/maxpool.h deleted file mode 100644 index 76fce03..0000000 --- a/include/spconv/maxpool.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SPARSE_MAXPOOL_FUNCTOR_H_ -#define SPARSE_MAXPOOL_FUNCTOR_H_ -#include -#include -#include -#include - -namespace spconv { - -void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures, - torch::Tensor dout, torch::Tensor din, - torch::Tensor indicesIn, torch::Tensor indicesOut, - int size); - -void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures, - torch::Tensor indicesIn, torch::Tensor indicesOut, - int size); - -void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures, - torch::Tensor dout, torch::Tensor din, - torch::Tensor indicesIn, torch::Tensor indicesOut, - int size); - -void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures, - torch::Tensor indicesIn, torch::Tensor indicesOut, - int size); - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/minkowski.cu.h b/include/spconv/minkowski.cu.h deleted file mode 100644 index 128e169..0000000 --- a/include/spconv/minkowski.cu.h +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu). - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural - * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part - * of the code. - */ - -template -__global__ void matmul(const Dtype *A, const int wA, const int hA, - const Dtype *B, const int wB, const int hB, Dtype *C, - const Itype *in_map, const Itype *out_map) { - // Use in_feat as A and kernel as B - - // Block index - const int bx = blockIdx.x; - const int by = blockIdx.y; - - // Thread index - const int tx = threadIdx.x; - const int ty = threadIdx.y; - - // Coordinate. y is for rows, x is for columns. - const int x = BLOCK_SIZE * bx + tx; - const int y = BLOCK_SIZE * by + ty; - - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - Dtype Csub = 0; - - const Itype in_row = y < hA ? in_map[y] : 0; - const Itype out_row = y < hA ? 
out_map[y] : 0; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int s = 0; s < wA; s += BLOCK_SIZE) { - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE]; - - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0; - Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0; - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[ty][k] * Bs[k][tx]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - if (y < hA && x < wB) - atomicAdd(&C[wB * out_row + x], Csub); - // C[wB * out_row + x] += Csub; -} - -template -__global__ void matmul2(const Dtype *A, const int wA, const int hA, - const Dtype *B, const int wB, const int hB, - const Dtype *D, const int wD, const int hD, Dtype *C, - Dtype *E, const Itype *in_map, const Itype *out_map) { - // Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D - - // Block index - const int bx = blockIdx.x; - const int by = blockIdx.y; - - // Thread index - const int tx = threadIdx.x; - const int ty = threadIdx.y; - - // Coordinate. y is for rows, x is for columns. - const int x = BLOCK_SIZE * bx + tx; - const int y = BLOCK_SIZE * by + ty; - - const Itype in_row = y < hA ? in_map[y] : 0; - const Itype out_row = y < hA ? out_map[y] : 0; - - // Csub is used to store the element of the block sub-matrix - // that is computed by the thread - Dtype Csub = 0; - Dtype Esub = 0; - - // Declaration of the shared memory array As used to - // store the sub-matrix of A - __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Bs used to - // store the sub-matrix of B - __shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE]; - - // Declaration of the shared memory array Ds used to - // store the sub-matrix of D - __shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE]; - - // For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A - DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0; - - // Loop over all the sub-matrices of A and B - // required to compute the block sub-matrix - for (int s = 0; s < wA; s += BLOCK_SIZE) { - // Load the matrices from device memory - // to shared memory; each thread loads - // one element of each matrix - As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0; - - // Transposed kernel - BTs[ty][tx] = ((s + ty) < wB && x < hB) ? 
B[wB * x + s + ty] : 0; - - // Synchronize to make sure the matrices are loaded - __syncthreads(); - - // Multiply the two matrices together; - // each thread computes one element - // of the block sub-matrix -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Csub += As[ty][k] * BTs[k][tx]; - } - - // For Esub, reset to 0 - Esub = 0; -#pragma unroll - for (int k = 0; k < BLOCK_SIZE; ++k) { - Esub += DTs[k][ty] * As[k][tx]; - } - - // Synchronize to make sure that the preceding - // computation is done before loading two new - // sub-matrices of A and B in the next iteration - __syncthreads(); - - // For the E matrix, which requires accumulation across multiple blocks, use - // atomic addition. This can be replaced with a more sophisticated reduction - // algorithm. - if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA) - atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub); - } - - // Write the block sub-matrix to device memory; - // each thread writes one element - if (y < hA && x < hB) - atomicAdd(&C[hB * in_row + x], Csub); -} diff --git a/include/spconv/nms.h b/include/spconv/nms.h deleted file mode 100644 index 4d0afe5..0000000 --- a/include/spconv/nms.h +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef NMS_CPU_H -#define NMS_CPU_H -#include -// must include pybind11/stl.h if using STL containers in arguments. 
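The two Minkowski kernels above implement a fused gather-GEMM-scatter: rows of the input features are gathered through in_map, multiplied by a dense kernel matrix, and atomically scattered through out_map. As a reading aid, a plain host-side reference of what matmul computes (a hypothetical helper, not part of the deleted file) could look like this:

```cpp
#include <vector>

// Host reference for the fused kernel above: for each mapped row r,
// C[out_map[r], :] += A[in_map[r], :] * B, where A is hA x wA and B is wA x wB.
void matmul_reference(const std::vector<float> &A, int wA, int hA,
                      const std::vector<float> &B, int wB,
                      std::vector<float> &C, // zero-initialized, rows of width wB
                      const std::vector<int> &in_map,
                      const std::vector<int> &out_map) {
  for (int r = 0; r < hA; ++r) {     // one thread row (y) per mapped row
    for (int x = 0; x < wB; ++x) {   // one thread column per output column
      float acc = 0;
      for (int k = 0; k < wA; ++k)
        acc += A[wA * in_map[r] + k] * B[wB * k + x];
      C[wB * out_map[r] + x] += acc; // atomicAdd in the CUDA version
    }
  }
}
```

matmul2 appears to compute the weight gradient E = D^T * A in the same pass, which is why it additionally stages the DTs tile and keeps a second accumulator.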
-#include "box_iou.h" -#include "nms_gpu.h" -#include -#include -#include -#include -#include -namespace spconv { -namespace py = pybind11; -using namespace pybind11::literals; - -template -std::vector non_max_suppression_cpu(py::array_t boxes, - py::array_t order, DType thresh, - DType eps = 0) { - auto ndets = boxes.shape(0); - auto boxes_r = boxes.template unchecked<2>(); - auto order_r = order.template unchecked<1>(); - auto suppressed = zeros({int(ndets)}); - auto suppressed_rw = suppressed.template mutable_unchecked<1>(); - auto area = zeros({int(ndets)}); - auto area_rw = area.template mutable_unchecked<1>(); - // get areas - for (int i = 0; i < ndets; ++i) { - area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) * - (boxes_r(i, 3) - boxes_r(i, 1) + eps); - } - std::vector keep; - int i, j; - DType xx1, xx2, w, h, inter, ovr; - for (int _i = 0; _i < ndets; ++_i) { - i = order_r(_i); - if (suppressed_rw(i) == 1) - continue; - keep.push_back(i); - for (int _j = _i + 1; _j < ndets; ++_j) { - j = order_r(_j); - if (suppressed_rw(j) == 1) - continue; - xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2)); - xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0)); - w = xx2 - xx1 + eps; - if (w > 0) { - xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3)); - xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1)); - h = xx2 - xx1 + eps; - if (h > 0) { - inter = w * h; - ovr = inter / (area_rw(i) + area_rw(j) - inter); - if (ovr >= thresh) - suppressed_rw(j) = 1; - } - } - } - } - return keep; -} - -template -std::vector rotate_non_max_suppression_cpu(py::array_t box_corners, - py::array_t order, - py::array_t standup_iou, - DType thresh) { - auto ndets = box_corners.shape(0); - auto box_corners_r = box_corners.template unchecked<3>(); - auto order_r = order.template unchecked<1>(); - auto suppressed = zeros({int(ndets)}); - auto suppressed_rw = suppressed.template mutable_unchecked<1>(); - auto standup_iou_r = standup_iou.template unchecked<2>(); - std::vector keep; - int i, j; - - namespace bg = boost::geometry; - typedef bg::model::point point_t; - typedef bg::model::polygon polygon_t; - polygon_t poly, qpoly; - std::vector poly_inter, poly_union; - DType inter_area, union_area, overlap; - - for (int _i = 0; _i < ndets; ++_i) { - i = order_r(_i); - if (suppressed_rw(i) == 1) - continue; - keep.push_back(i); - for (int _j = _i + 1; _j < ndets; ++_j) { - j = order_r(_j); - if (suppressed_rw(j) == 1) - continue; - if (standup_iou_r(i, j) <= 0.0) - continue; - // std::cout << "pre_poly" << std::endl; - try { - bg::append(poly, - point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1))); - bg::append(poly, - point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1))); - bg::append(poly, - point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1))); - bg::append(poly, - point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1))); - bg::append(poly, - point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1))); - bg::append(qpoly, - point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1))); - bg::append(qpoly, - point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1))); - bg::append(qpoly, - point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1))); - bg::append(qpoly, - point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1))); - bg::append(qpoly, - point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1))); - bg::intersection(poly, qpoly, poly_inter); - } catch (const std::exception &e) { - std::cout << "box i corners:" << std::endl; - for (int k = 0; k < 4; ++k) { - std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1) - << 
std::endl; - } - std::cout << "box j corners:" << std::endl; - for (int k = 0; k < 4; ++k) { - std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1) - << std::endl; - } - // throw e; - continue; - } - // std::cout << "post_poly" << std::endl; - // std::cout << "post_intsec" << std::endl; - if (!poly_inter.empty()) { - inter_area = bg::area(poly_inter.front()); - // std::cout << "pre_union" << " " << inter_area << std::endl; - bg::union_(poly, qpoly, poly_union); - /* - if (poly_union.empty()){ - std::cout << "intsec area:" << " " << inter_area << std::endl; - std::cout << "box i corners:" << std::endl; - for(int k = 0; k < 4; ++k){ - std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, - k, 1) << std::endl; - } - std::cout << "box j corners:" << std::endl; - for(int k = 0; k < 4; ++k){ - std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, - k, 1) << std::endl; - } - }*/ - // std::cout << "post_union" << poly_union.empty() << std::endl; - if (!poly_union.empty()) { // ignore invalid box - union_area = bg::area(poly_union.front()); - // std::cout << "post union area" << std::endl; - // std::cout << union_area << "debug" << std::endl; - overlap = inter_area / union_area; - if (overlap >= thresh) - suppressed_rw(j) = 1; - poly_union.clear(); - } - } - poly.clear(); - qpoly.clear(); - poly_inter.clear(); - } - } - return keep; -} -#ifdef TV_CUDA -constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8; - -template -int non_max_suppression(py::array_t boxes, py::array_t keep_out, - DType nms_overlap_thresh, int device_id) { - py::buffer_info info = boxes.request(); - auto boxes_ptr = static_cast(info.ptr); - py::buffer_info info_k = keep_out.request(); - auto keep_out_ptr = static_cast(info_k.ptr); - - return _nms_gpu(keep_out_ptr, boxes_ptr, - boxes.shape(0), boxes.shape(1), - nms_overlap_thresh, device_id); -} -#endif - -} // namespace spconv -#endif diff --git a/include/spconv/nms_functor.h b/include/spconv/nms_functor.h deleted file mode 100644 index ba108e6..0000000 --- a/include/spconv/nms_functor.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
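The GPU path above delegates to _nms_gpu (declared in nms_gpu.h below), and threadsPerBlock is 64 because bitmask-style NMS gives each thread one 64-bit suppression word per 64-box column block (sizeof(unsigned long long) * 8). The device code is not part of this header; a host-side reduction consistent with that mask layout (an assumed sketch, names hypothetical) is:

```cpp
#include <cstdint>
#include <vector>

// Reduce per-box suppression masks into kept indices. mask has
// boxes_num * col_blocks 64-bit words; bit j of mask[i * col_blocks + b]
// means box (b * 64 + j) overlaps box i beyond the threshold.
static int reduce_nms_masks(const std::vector<uint64_t> &mask, int boxes_num,
                            int *keep_out) {
  const int col_blocks = (boxes_num + 63) / 64;
  std::vector<uint64_t> removed(col_blocks, 0);
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; ++i) {
    int block = i / 64, bit = i % 64;
    if (!(removed[block] & (1ULL << bit))) {
      keep_out[num_to_keep++] = i;
      const uint64_t *row = &mask[(size_t)i * col_blocks];
      for (int b = block; b < col_blocks; ++b)
        removed[b] |= row[b]; // union in everything box i suppresses
    }
  }
  return num_to_keep;
}
```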
- -#ifndef NMS_FUNCTOR_H_ -#define NMS_FUNCTOR_H_ -#include - -namespace spconv { -namespace functor { -template -struct NonMaxSupressionFunctor { - Index operator()(const Device &d, tv::TensorView keep, - tv::TensorView boxes, T threshold, T eps); -}; - -template -struct rotateNonMaxSupressionFunctor { - Index operator()(const Device &d, tv::TensorView keep, - tv::TensorView boxCorners, - tv::TensorView standupIoU, T threshold); -}; - -} // namespace functor -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/nms_gpu.h b/include/spconv/nms_gpu.h deleted file mode 100644 index 15b735f..0000000 --- a/include/spconv/nms_gpu.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -template -int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num, - int boxes_dim, DType nms_overlap_thresh, int device_id); diff --git a/include/spconv/nms_ops.h b/include/spconv/nms_ops.h deleted file mode 100644 index 714cb33..0000000 --- a/include/spconv/nms_ops.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef NMS_TORCH_OP_H_ -#define NMS_TORCH_OP_H_ - -#include -#include -#include -#include -#include -#include - -namespace spconv { -// torch.jit's doc says only support int64, so we need to convert to int32. 
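nms_functor.h above only declares the functor templates; the per-device definitions are expected to live in separate translation units, so including the header never pulls CUDA code into CPU builds. A minimal sketch of that dispatch pattern (a hypothetical ClampFunctor, with plain pointers standing in for tv::TensorView):

```cpp
struct CPU {}; // stand-in for tv::CPU

// Primary template: declared only, never defined.
template <typename Device, typename T> struct ClampFunctor {
  void operator()(const Device &d, T *data, int size, T lo, T hi);
};

// CPU specialization, defined in a host-only .cc file.
template <typename T> struct ClampFunctor<CPU, T> {
  void operator()(const CPU &, T *data, int size, T lo, T hi) {
    for (int i = 0; i < size; ++i)
      data[i] = data[i] < lo ? lo : (data[i] > hi ? hi : data[i]);
  }
};
```

A GPU specialization would live in a .cu file and launch a kernel, which is presumably why nonMaxSuppression below asserts "not implemented" for CUDA tensors: only the CPU specialization exists.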
-template -torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores, - int64_t preMaxSize, int64_t postMaxSize, - double thresh, double eps) { - // auto timer = spconv::CudaContextTimer<>(); - tv::check_torch_dtype(boxes); - auto resOptions = - torch::TensorOptions().dtype(torch::kInt64).device(boxes.device()); - if (boxes.size(0) == 0) { - return torch::zeros({0}, resOptions); - } - torch::Tensor indices; - if (preMaxSize > 0) { - auto numKeepedScores = scores.size(0); - preMaxSize = std::min(numKeepedScores, preMaxSize); - auto res = torch::topk(scores, preMaxSize); - indices = std::get<1>(res); - boxes = torch::index_select(boxes, 0, indices); - } else { - indices = std::get<1>(torch::sort(scores)); - boxes = torch::index_select(boxes, 0, indices); - } - if (boxes.size(0) == 0) - return torch::zeros({0}, resOptions); - - auto keep = torch::zeros({boxes.size(0)}, resOptions); - int64_t keepNum = 0; - if (boxes.device().type() == torch::kCPU) { - auto nmsFunctor = functor::NonMaxSupressionFunctor(); - keepNum = nmsFunctor(tv::CPU(), tv::torch2tv(keep), - tv::torch2tv(boxes), T(thresh), T(eps)); - } else { - TV_ASSERT_RT_ERR(false, "not implemented"); - } - if (postMaxSize <= 0) { - postMaxSize = keepNum; - } - // std::cout << keep << std::endl; - keep = keep.slice(0, 0, std::min(keepNum, postMaxSize)); - if (preMaxSize > 0) { - return torch::index_select(indices, 0, keep); - } - return keep; -} - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/pillar_scatter_functor.h b/include/spconv/pillar_scatter_functor.h deleted file mode 100644 index 518f2a3..0000000 --- a/include/spconv/pillar_scatter_functor.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_ -#define POINTPILLARS_SCATTER_FUNCTOR_H_ -#include - -namespace spconv { -namespace functor { -template -struct PointPillarScatter { - void operator()(const Device &d, tv::TensorView canvas, - tv::TensorView features, - tv::TensorView coors); -}; - -} // namespace functor -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/pillar_scatter_ops.h b/include/spconv/pillar_scatter_ops.h deleted file mode 100644 index 1f5e2c2..0000000 --- a/include/spconv/pillar_scatter_ops.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
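Calling the nonMaxSuppression op defined above might look like the following (hypothetical values; boxes and scores must be CPU tensors, since the CUDA branch raises "not implemented"):

```cpp
#include <spconv/nms_ops.h>
#include <torch/torch.h>

torch::Tensor keep_top_boxes(torch::Tensor boxes /*[N, 4] float*/,
                             torch::Tensor scores /*[N] float*/) {
  // preMaxSize = 1000: keep at most the 1000 highest-scoring candidates first;
  // postMaxSize = 100: return at most 100 surviving box indices.
  return spconv::nonMaxSuppression<float>(boxes, scores, /*preMaxSize=*/1000,
                                          /*postMaxSize=*/100,
                                          /*thresh=*/0.5, /*eps=*/0.0);
}
```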
- -#ifndef PILLAR_SCATTER_OP_H_ -#define PILLAR_SCATTER_OP_H_ - -#include -#include -#include -#include - -namespace spconv { -// torch.jit's doc says only support int64, so we need to convert to int32. - -template -torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors, - torch::Tensor shape) { - TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error"); - TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error"); - TV_ASSERT_RT_ERR(shape.dim() == 1, "error"); - TV_ASSERT_RT_ERR(shape.size(0) == 4, "error"); - TV_ASSERT_RT_ERR(features.dim() >= 3, "error"); - TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1"); - TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1"); - TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err"); - - tv::check_torch_dtype(shape); - tv::check_torch_dtype(coors); - auto shapeData = shape.data_ptr(); - torch::Tensor canvas = - torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]}, - features.options()); - TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error"); -#ifdef TV_CUDA - functor::PointPillarScatter ftor; - ftor(tv::TorchGPU(), tv::torch2tv(canvas), - tv::torch2tv(features.squeeze()), - tv::torch2tv(coors.squeeze())); -#endif - return canvas; -} - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/point2voxel.cu.h b/include/spconv/point2voxel.cu.h deleted file mode 100644 index 2a68750..0000000 --- a/include/spconv/point2voxel.cu.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace spconv { -template -__global__ void scatterPointToGridKernel( - tv::TensorView points, tv::TensorView indexes, - tv::TensorView grids, tv::TensorView numPointsPerGrid, - tv::TensorView pointIndex, - const tv::SimpleVector gridShape) { - Index index; - int numPoints = points.dim(0); - int numFeatures = points.dim(1); - - for (int ix : tv::KernelLoopX(numPoints)) { - index = tv::ArrayIndexRowMajor::runPtrs( - indexes.data() + ix * NDim, gridShape.data(), 0); - pointIndex(ix) = index; - atomicAdd(numPointsPerGrid.data() + index, Index(1)); -#pragma unroll - for (int k = 0; k != numFeatures; ++k) { - atomicAdd(grids.data() + index * numFeatures + k, - *(points.data() + ix * numFeatures + k)); - } - } -} - -template -__global__ void -gatherPointFromGridKernel(tv::TensorView grids, - tv::TensorView numPointsPerGrid, - tv::TensorView pointIndexUnique, - tv::TensorView voxels, - tv::TensorView coors, - const tv::SimpleVector gridShape) { - Index index; - int numVoxels = voxels.dim(0); - int numFeatures = grids.dim(1); - - for (int ix : tv::KernelLoopX(numVoxels)) { - index = pointIndexUnique(ix); -#pragma unroll - for (int k = 0; k != numFeatures; ++k) { - voxels(ix, k) = grids(index, k) / numPointsPerGrid(index); - } - index = tv::rowArrayIdxInv(index, coors.data() + ix * NDim, - gridShape.data()); - } -} - -template -__global__ void resetGridKernel(tv::TensorView grids, - tv::TensorView numPointsPerGrid, - tv::TensorView pointIndexUnique) { - Index index; - int numVoxels = pointIndexUnique.dim(0) - 1; - int numFeatures = grids.dim(1); - - for (int ix : tv::KernelLoopX(numVoxels)) { - index = pointIndexUnique(ix); -#pragma unroll - for (int k = 0; k != numFeatures; ++k) { - grids(index, k) = 0; - numPointsPerGrid(index) = 0; - } - } -} - -template -__global__ void resetPointIndexKernel(tv::TensorView pointIndex, - const Index gridVolume) { - int num_max_points = pointIndex.dim(0) - 1; - - for (int ix : 
tv::KernelLoopX(num_max_points)) { - pointIndex(ix) = gridVolume; - } -} -} // namespace spconv diff --git a/include/spconv/point2voxel.h b/include/spconv/point2voxel.h deleted file mode 100644 index 477d1af..0000000 --- a/include/spconv/point2voxel.h +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -// must include pybind11/eigen.h if using eigen matrix as arguments. -// must include pybind11/stl.h if using containers in STL in arguments. -#include -#include -#include -// #include -#include -#include - -namespace spconv { -namespace py = pybind11; -using namespace pybind11::literals; - -template -int points_to_voxel_3d_np(py::array_t points, py::array_t voxels, - py::array_t voxel_point_mask, - py::array_t coors, - py::array_t num_points_per_voxel, - py::array_t coor_to_voxelidx, - std::vector voxel_size, - std::vector coors_range, int max_points, - int max_voxels) { - auto points_rw = points.template mutable_unchecked<2>(); - auto voxels_rw = voxels.template mutable_unchecked<3>(); - auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>(); - auto coors_rw = coors.mutable_unchecked<2>(); - auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); - auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked(); - auto N = points_rw.shape(0); - auto num_features = points_rw.shape(1); - // auto ndim = points_rw.shape(1) - 1; - constexpr int ndim_minus_1 = NDim - 1; - int voxel_num = 0; - bool failed = false; - int coor[NDim]; - int c; - int grid_size[NDim]; - for (int i = 0; i < NDim; ++i) { - grid_size[i] = - round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); - } - int voxelidx, num; - for (int i = 0; i < N; ++i) { - failed = false; - for (int j = 0; j < NDim; ++j) { - c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]); - if ((c < 0 || c >= grid_size[j])) { - failed = true; - break; - } - coor[ndim_minus_1 - j] = c; - } - if (failed) - continue; - voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]); - if (voxelidx == -1) { - voxelidx = voxel_num; - if (voxel_num >= max_voxels) - continue; - voxel_num += 1; - coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx; - for (int k = 0; k < NDim; ++k) { - coors_rw(voxelidx, k) = coor[k]; - } - } - num = num_points_per_voxel_rw(voxelidx); - if (num < max_points) { - voxel_point_mask_rw(voxelidx, num) = DType(1); - for (int k = 0; k < num_features; ++k) { - voxels_rw(voxelidx, num, k) = points_rw(i, k); - } - num_points_per_voxel_rw(voxelidx) += 1; - } - } - for (int i = 0; i < voxel_num; ++i) { - coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1; - } - return voxel_num; -} - -template -int points_to_voxel_3d_np_mean( - py::array_t points, py::array_t voxel_point_mask, - py::array_t voxels, py::array_t means, py::array_t coors, - py::array_t num_points_per_voxel, py::array_t coor_to_voxelidx, - std::vector voxel_size, std::vector coors_range, - int 
max_points, int max_voxels) { - auto points_rw = points.template mutable_unchecked<2>(); - auto means_rw = means.template mutable_unchecked<2>(); - auto voxels_rw = voxels.template mutable_unchecked<3>(); - auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>(); - auto coors_rw = coors.mutable_unchecked<2>(); - auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); - auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked(); - auto N = points_rw.shape(0); - auto num_features = points_rw.shape(1); - // auto ndim = points_rw.shape(1) - 1; - constexpr int ndim_minus_1 = NDim - 1; - int voxel_num = 0; - bool failed = false; - int coor[NDim]; - int c; - int grid_size[NDim]; - for (int i = 0; i < NDim; ++i) { - grid_size[i] = - round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); - } - int voxelidx, num; - for (int i = 0; i < N; ++i) { - failed = false; - for (int j = 0; j < NDim; ++j) { - c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]); - if ((c < 0 || c >= grid_size[j])) { - failed = true; - break; - } - coor[ndim_minus_1 - j] = c; - } - if (failed) - continue; - voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]); - if (voxelidx == -1) { - voxelidx = voxel_num; - if (voxel_num >= max_voxels) - continue; - voxel_num += 1; - coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx; - for (int k = 0; k < NDim; ++k) { - coors_rw(voxelidx, k) = coor[k]; - } - } - num = num_points_per_voxel_rw(voxelidx); - if (num < max_points) { - voxel_point_mask_rw(voxelidx, num) = DType(1); - for (int k = 0; k < num_features; ++k) { - voxels_rw(voxelidx, num, k) = points_rw(i, k); - } - num_points_per_voxel_rw(voxelidx) += 1; - for (int k = 0; k < num_features; ++k) { - means_rw(voxelidx, k) += - (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1); - } - } - } - for (int i = 0; i < voxel_num; ++i) { - coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1; - num = num_points_per_voxel_rw(i); - for (int j = num; j < max_points; ++j) { - for (int k = 0; k < num_features; ++k) { - voxels_rw(i, j, k) = means_rw(i, k); - } - } - } - return voxel_num; -} - -template -int points_to_voxel_3d_with_filtering( - py::array_t points, py::array_t voxels, - py::array_t voxel_point_mask, py::array_t voxel_mask, - py::array_t mins, py::array_t maxs, py::array_t coors, - py::array_t num_points_per_voxel, py::array_t coor_to_voxelidx, - std::vector voxel_size, std::vector coors_range, - int max_points, int max_voxels, int block_factor, int block_size, - DType height_threshold, DType height_high_threshold) { - auto points_rw = points.template mutable_unchecked<2>(); - auto mins_rw = mins.template mutable_unchecked<2>(); - auto maxs_rw = maxs.template mutable_unchecked<2>(); - auto voxels_rw = voxels.template mutable_unchecked<3>(); - auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>(); - auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>(); - auto coors_rw = coors.mutable_unchecked<2>(); - auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>(); - auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked(); - auto N = points_rw.shape(0); - auto num_features = points_rw.shape(1); - // auto ndim = points_rw.shape(1) - 1; - constexpr int ndim_minus_1 = NDim - 1; - int voxel_num = 0; - bool failed = false; - int coor[NDim]; - int c; - int grid_size[NDim]; - - DType max_value, min_value; - for (int i = 0; i < NDim; ++i) { - grid_size[i] = - round((coors_range[NDim + i] - 
coors_range[i]) / voxel_size[i]); - } - int block_shape_H = grid_size[1] / block_factor; - int block_shape_W = grid_size[0] / block_factor; - int voxelidx, num; - int block_coor[2]; - int startx, stopx, starty, stopy; - for (int i = 0; i < N; ++i) { - failed = false; - for (int j = 0; j < NDim; ++j) { - c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]); - if ((c < 0 || c >= grid_size[j])) { - failed = true; - break; - } - coor[ndim_minus_1 - j] = c; - } - if (failed) - continue; - voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]); - if (voxelidx == -1) { - voxelidx = voxel_num; - if (voxel_num >= max_voxels) - continue; - voxel_num += 1; - coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx; - for (int k = 0; k < NDim; ++k) { - coors_rw(voxelidx, k) = coor[k]; - } - } - num = num_points_per_voxel_rw(voxelidx); - if (num < max_points) { - voxel_point_mask_rw(voxelidx, num) = DType(1); - for (int k = 0; k < num_features; ++k) { - voxels_rw(voxelidx, num, k) = points_rw(i, k); - } - block_coor[0] = coor[1] / block_factor; - block_coor[1] = coor[2] / block_factor; - mins_rw(block_coor[0], block_coor[1]) = - std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1])); - maxs_rw(block_coor[0], block_coor[1]) = - std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1])); - num_points_per_voxel_rw(voxelidx) += 1; - } - } - for (int i = 0; i < voxel_num; ++i) { - coor[1] = coors_rw(i, 1); - coor[2] = coors_rw(i, 2); - coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1; - block_coor[0] = coor[1] / block_factor; - block_coor[1] = coor[2] / block_factor; - min_value = mins_rw(block_coor[0], block_coor[1]); - max_value = maxs_rw(block_coor[0], block_coor[1]); - startx = std::max(0, block_coor[0] - block_size / 2); - stopx = - std::min(block_shape_H, block_coor[0] + block_size - block_size / 2); - starty = std::max(0, block_coor[1] - block_size / 2); - stopy = - std::min(block_shape_W, block_coor[1] + block_size - block_size / 2); - - for (int j = startx; j < stopx; ++j) { - for (int k = starty; k < stopy; ++k) { - min_value = std::min(min_value, mins_rw(j, k)); - max_value = std::max(max_value, maxs_rw(j, k)); - } - } - voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) && - ((max_value - min_value) < height_high_threshold); - } - return voxel_num; -} - -} // namespace spconv diff --git a/include/spconv/point2voxel_ops.h b/include/spconv/point2voxel_ops.h deleted file mode 100644 index 9aba5e4..0000000 --- a/include/spconv/point2voxel_ops.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2020 xmyqsh -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
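All three points_to_voxel_* variants above share the same coordinate math: clamp-check each axis with c = floor((p - range_min) / voxel_size), and store the coordinates in reversed (z, y, x) order. A self-contained illustration with made-up ranges (the numbers are examples, not values from this file):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // (xmin, ymin, zmin, xmax, ymax, zmax) and per-axis voxel size.
  double coors_range[6] = {0.0, -40.0, -3.0, 70.4, 40.0, 1.0};
  double voxel_size[3] = {0.2, 0.2, 0.4};
  double point[3] = {12.34, -7.8, -1.0};
  int grid_size[3], coor[3];
  for (int j = 0; j < 3; ++j)
    grid_size[j] =
        (int)std::round((coors_range[3 + j] - coors_range[j]) / voxel_size[j]);
  bool failed = false;
  for (int j = 0; j < 3; ++j) {
    int c = (int)std::floor((point[j] - coors_range[j]) / voxel_size[j]);
    if (c < 0 || c >= grid_size[j]) { failed = true; break; }
    coor[2 - j] = c; // reversed axis order: stored as (z, y, x)
  }
  if (!failed) // prints (5, 161, 61) for the point above
    printf("voxel coord (z,y,x) = (%d, %d, %d)\n", coor[0], coor[1], coor[2]);
  return 0;
}
```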
- -#pragma once - -#include -#include -#include -#include - -namespace spconv { - -int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes, - torch::Tensor pointIndex, torch::Tensor grids, - torch::Tensor numPointsPerGrid, torch::Tensor voxels, - torch::Tensor coors, std::vector gridShape, - const int64_t ndim); - -} // namespace spconv diff --git a/include/spconv/points2voxels.h b/include/spconv/points2voxels.h deleted file mode 100644 index b587081..0000000 --- a/include/spconv/points2voxels.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include -#include - -namespace spconv { - -void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes, - torch::Tensor grids, - torch::Tensor numPointsPerGrid, - torch::Tensor pointIndex, - std::vector gridShape, const int ndim); - -void gather_point_from_grid_cuda(torch::Tensor grids, - torch::Tensor numPointsPerGrid, - torch::Tensor pointIndex, - torch::Tensor pointIndexUnique, - torch::Tensor voxels, torch::Tensor coors, - std::vector gridShape, - const int ndim); - -} // namespace spconv diff --git a/include/spconv/pool_ops.h b/include/spconv/pool_ops.h deleted file mode 100644 index 39cd59c..0000000 --- a/include/spconv/pool_ops.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SPARSE_POOL_OP_H_ -#define SPARSE_POOL_OP_H_ - -#include -#include -#include -#include - -namespace spconv { -torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs, - torch::Tensor indiceNum, int64_t numAct); - -torch::Tensor indiceMaxPoolBackward(torch::Tensor features, - torch::Tensor outFeatures, - torch::Tensor outGrad, - torch::Tensor indicePairs, - torch::Tensor indiceNum); - -} // namespace spconv - -#endif \ No newline at end of file diff --git a/include/spconv/reordering.cu.h b/include/spconv/reordering.cu.h deleted file mode 100644 index 61bfd55..0000000 --- a/include/spconv/reordering.cu.h +++ /dev/null @@ -1,432 +0,0 @@ -// Copyright 2019-2020 Yan Yan -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef REORDERING_CU_H_ -#define REORDERING_CU_H_ -#include -#include -#include -#include - -#if PYTORCH_VERSION < 10500 -#define TH_ATOMIC_ADD atomicAdd -#else -#define TH_ATOMIC_ADD gpuAtomicAdd -#endif - -// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. 
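The GTC talk referenced above describes the buffer-based dataflow these kernels serve: gather the active input rows into a dense buffer, run a dense GEMM per kernel offset, then scatter-add the result back to the output rows. Host references matching the generic kernels that follow (plain pointers standing in for tv::TensorView):

```cpp
// buffer[i, :] = features[indices[i], :], as in gatherGenericKernel below.
template <typename T, typename Index>
void gather_reference(T *buffer, const T *features, const Index *indices,
                      int size, int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int p = 0; p < numPlanes; ++p)
      buffer[i * numPlanes + p] = features[indices[i] * numPlanes + p];
}

// outFeatures[indices[i], :] += buffer[i, :], as in scatterAddGenericKernel.
template <typename T, typename Index>
void scatter_add_reference(T *outFeatures, const T *buffer,
                           const Index *indices, int size, int numPlanes) {
  for (int i = 0; i < size; ++i)
    for (int p = 0; p < numPlanes; ++p)
      outFeatures[indices[i] * numPlanes + p] += buffer[i * numPlanes + p];
}
```

The CUDA versions below are the same loops tiled over threads, with the Vec/Block variants adding vectorized loads and per-block feature slices.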
-namespace spconv { - -template -__global__ void gatherGenericKernel(T *buffer, const T *features, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - features[inds[ilp] + iy]; - } - } - } -} - -template -__global__ void gatherVecKernel(T *buffer, const T *features, - const Index *indices, int size, int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - reinterpret_cast(features)[inds[ilp] + iy]; - } - } - } -} - -template -__global__ void gatherVecBlockKernel(T *buffer, const T *features, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - features += blockIdx.y * NumTLP; - buffer += blockIdx.y * NumTLP; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] = - reinterpret_cast( - features)[indices[ix + ILPStrideX[ilp]] * numPlanes + - threadIdx.y]; - } - } -} - -template -__global__ void batchGatherGenericKernel(T *buffer, const T *features, - const Index *indices, int size, - int numPlanes, int indice_batch_stride, - int feature_batch_stride) { - // size: max indice num * kernel volume - // inds: [volume, num_elems] - int ILPStrideX[NumILP]; - Index inds[NumILP]; - Index inds_elem; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) { - inds_elem = ix + ILPStrideX[ilp]; - inds[ilp] = - indices[(inds_elem / feature_batch_stride) * indice_batch_stride + - inds_elem % feature_batch_stride]; - } - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) { - if (inds[ilp] != -1) { - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - features[inds[ilp] * numPlanes + iy]; - - } else { - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = T(0); - } - } - } - } - } -} - -template -__global__ void -batchGatherVecKernel(T *buffer, const T *features, const Index *indices, - int size, int feature_offset, int numPlanes, - int indice_batch_stride, int feature_batch_stride) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; - Index zero[sizeof(VecType) / sizeof(T)]; -#pragma unroll - for (int i = 0; i < 
sizeof(VecType) / sizeof(T); ++i) { - zero[i] = T(0); - } - - Index inds_elem; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) { - inds_elem = ix + ILPStrideX[ilp] + feature_offset; - inds[ilp] = - indices[(inds_elem / feature_batch_stride) * indice_batch_stride + - inds_elem % feature_batch_stride]; - } - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) { - if (inds[ilp] != -1) { - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - reinterpret_cast( - features)[inds[ilp] * numPlanes + iy]; - - } else { - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = - reinterpret_cast(&zero)[0]; - } - } - } - } - } -} - -template -__global__ void -batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices, - int size, int numPlanes, int indice_batch_stride, - int feature_batch_stride) { - int ILPStrideX[NumILP]; - Index inds; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - features += blockIdx.y * NumTLP; - buffer += blockIdx.y * NumTLP; - - Index inds_elem; - Index zero[sizeof(VecType) / sizeof(T)]; -#pragma unroll - for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) { - zero[i] = T(0); - } - - for (int ix : tv::KernelLoopX(size)) { - -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - inds_elem = ix + ILPStrideX[ilp]; - inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride + - inds_elem % feature_batch_stride]; - - if (inds != -1) { - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] = - reinterpret_cast( - features)[inds * numPlanes + threadIdx.y]; - } else { - reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] = - reinterpret_cast(&zero)[0]; - } - } - } -} - -template -__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; - Index inds[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) { - if (ix + ILPStrideX[ilp] < size) - inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; - } - for (int iy : tv::KernelLoopY(numPlanes)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - if (ix + ILPStrideX[ilp] < size) { - outFeatures[inds[ilp] + iy] += - buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]; - } - } - } - } -} - -template -__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; - constexpr int vecloadFactor = sizeof(VecType) / sizeof(T); - constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2); - -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - outFeatures += blockIdx.y * NumTLP; - buffer += blockIdx.y * NumTLP; - T buf[vecloadFactor]; - T buf2[vecloadFactor]; - Index idx; - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y; - reinterpret_cast(buf)[0] = - 
reinterpret_cast(outFeatures)[idx]; - reinterpret_cast(buf2)[0] = reinterpret_cast( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]; - if (std::is_same::value) { -#if __CUDA_ARCH__ >= 530 -#pragma unroll - for (int i = 0; i < vecloadHalf2Factor; i++) { - reinterpret_cast<__half2 *>(buf)[i] = - __hadd2(reinterpret_cast<__half2 *>(buf)[i], - reinterpret_cast<__half2 *>(buf2)[i]); - } -#else -#pragma unroll - for (int i = 0; i < vecloadFactor; i++) { - buf[i] += buf2[i]; - } -#endif - } else { -#pragma unroll - for (int i = 0; i < vecloadFactor; i++) { - buf[i] += buf2[i]; - } - } - reinterpret_cast(outFeatures)[idx] = - reinterpret_cast(buf)[0]; - } - } -} - -template -__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - outFeatures += blockIdx.y * NumTLP; - buffer += blockIdx.y * NumTLP; - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] += - buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]; - } - } -} - -#if __CUDA_ARCH__ >= 530 -template -__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer, - const Index *indices, int size, - int numPlanes) { - int ILPStrideX[NumILP]; -#pragma unroll - for (int ilp = 0; ilp < NumILP; ilp++) - ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; - outFeatures += blockIdx.y * NumTLP; - buffer += blockIdx.y * NumTLP; - Index idx; - for (int ix : tv::KernelLoopX(size)) { -#pragma unroll - for (int ilp = 0; ilp < NumILP; ++ilp) { - idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y; - reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2( - reinterpret_cast<__half2 *>(outFeatures)[idx], - reinterpret_cast<__half2 *>( - buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]); - } - } -} -#endif - -template -__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer, - const Index *indices, int size, - int feature_offset, int numPlanes, - int indice_batch_stride, - int feature_batch_stride) { - // batch scatter add is greatly slower than native scatter when the number of - // points is large; this may be due to atomicAdd contention. - // batch scatter add is greatly faster than native when the number of points - // is small. 
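// Added note (an assumption, not from the original file): the non-batched
// scatterAdd kernels above can use a plain "+=" because each launch covers a
// single kernel offset, where every output row appears at most once. This
// batched variant covers all offsets in one launch, so several offsets may
// target the same output row; TH_ATOMIC_ADD is then required, which would
// explain the slowdown described in the comment above.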
-  int ILPStrideX[NumILP];
-  Index inds[NumILP];
-  Index inds_elem;
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ilp++) {
-      if (ix + ILPStrideX[ilp] < size) {
-        inds_elem = ix + ILPStrideX[ilp] + feature_offset;
-        inds[ilp] =
-            indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                    inds_elem % feature_batch_stride];
-      }
-    }
-    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
-#pragma unroll
-      for (int ilp = 0; ilp < NumILP; ++ilp) {
-        if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
-          TH_ATOMIC_ADD(outFeatures + inds[ilp] * numPlanes + iy,
-                        buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
-        }
-      }
-    }
-  }
-}
-
-template <typename T, typename Index, int NumTLP, int NumILP>
-__global__ void
-batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
-                           const Index *indices, int size, int numPlanes,
-                           int indice_batch_stride, int feature_batch_stride) {
-  int ILPStrideX[NumILP];
-#pragma unroll
-  for (int ilp = 0; ilp < NumILP; ilp++)
-    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
-  outFeatures += blockIdx.y * NumTLP;
-  buffer += blockIdx.y * NumTLP;
-  Index inds, inds_elem;
-  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
-#pragma unroll
-    for (int ilp = 0; ilp < NumILP; ++ilp) {
-      inds_elem = ix + ILPStrideX[ilp];
-      inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
-                     inds_elem % feature_batch_stride];
-      if (inds != -1) {
-        TH_ATOMIC_ADD(outFeatures + inds * numPlanes + threadIdx.y,
-                      buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
-      }
-    }
-  }
-}
-
-} // namespace spconv
-
-#undef TH_ATOMIC_ADD
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/reordering.h b/include/spconv/reordering.h
deleted file mode 100644
index 202c5c8..0000000
--- a/include/spconv/reordering.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_REORDERING_FUNCTOR_H_
-#define SPARSE_REORDERING_FUNCTOR_H_
-#include <cuda_runtime_api.h>
-#include <tensorview/tensorview.h>
-#include <torch/script.h>
-namespace spconv {
-
-void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                              torch::Tensor indices, int size);
-void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
-                                   torch::Tensor outFeatures,
-                                   torch::Tensor indices, int size);
-
-void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
-                        torch::Tensor indices, int size);
-void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
-                             torch::Tensor indices, int size);
-
-void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
-                       torch::Tensor indices, int size);
-void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
-                            torch::Tensor indices, int size);
-
-void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
-                        torch::Tensor features, torch::Tensor indices,
-                        int size);
-void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
-                             torch::Tensor outFeatures, torch::Tensor indices,
-                             int size);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spconv/spconv_ops.h b/include/spconv/spconv_ops.h
deleted file mode 100644
index f2747c4..0000000
--- a/include/spconv/spconv_ops.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SPARSE_CONV_OP_H_
-#define SPARSE_CONV_OP_H_
-
-#include <cuda_runtime_api.h>
-#include <spconv/indice.h>
-#include <spconv/reordering.h>
-#include <tensorview/mp_helper.h>
-#include <torch/script.h>
-
-namespace spconv {
-
-enum ConvAlgo {
-  kNative = 0,
-  kBatch,
-  kBatchGemmGather,
-  kSparseConvNet,
-  kMinkowskiEngine
-};
-using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
-                                       kSparseConvNet, kMinkowskiEngine>;
-
-// torch.jit's doc says it only supports int64, so we need to convert to int32.
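-// A minimal sketch of that conversion inside an op wrapper (tensor names are
-// illustrative):
-//   torch::Tensor indices_i32 = indices_i64.to(torch::kInt32);
-//   // ... int32 kernels run here ...
-//   torch::Tensor result_i64 = result_i32.to(torch::kInt64);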
-std::vector<torch::Tensor>
-getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
-               std::vector<int64_t> outSpatialShape,
-               std::vector<int64_t> spatialShape,
-               std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
-               std::vector<int64_t> padding, std::vector<int64_t> dilation,
-               std::vector<int64_t> outPadding, int64_t _subM,
-               int64_t _transpose, int64_t _useHash);
-
-torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
-                         torch::Tensor indicePairs, torch::Tensor indiceNum,
-                         int64_t numActOut, int64_t _inverse, int64_t _subM,
-                         int64_t algo);
-std::vector<torch::Tensor>
-indiceConvBackward(torch::Tensor features, torch::Tensor filters,
-                   torch::Tensor outGrad, torch::Tensor indicePairs,
-                   torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
-                   int64_t algo);
-
-} // namespace spconv
-
-#endif
\ No newline at end of file
diff --git a/include/spgemm/gemm.h b/include/spgemm/gemm.h
deleted file mode 100644
index bb60662..0000000
--- a/include/spgemm/gemm.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#pragma once
-#include <cuda_runtime_api.h>
-#include <cutlass/gemm/device/gemm.h>
-namespace spconv {
-
-template <typename T>
-using determine_acc_t =
-    std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
-
-template <typename T, bool TransA, bool TransB, bool TransC>
-cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
-                        T const *A, int lda, T const *B, int ldb, T beta, T *C,
-                        int ldc) {
-
-  // Define type definition for single-precision CUTLASS GEMM with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
-  //
-  // To keep the interface manageable, several helpers are defined for
-  // plausible compositions including the following example for
-  // single-precision GEMM. Typical values are used as default template
-  // arguments. See `cutlass/gemm/device/default_gemm_configuration.h` for
-  // more details.
-  //
-  // To view the full gemm device API interface, see
-  // `cutlass/gemm/device/gemm.h`
-  using TAcc = determine_acc_t<T>;
-  using ColumnMajor = cutlass::layout::ColumnMajor;
-  using RowMajor = cutlass::layout::RowMajor;
-  using LayoutA = std::conditional_t<TransA, ColumnMajor, RowMajor>;
-  using LayoutB = std::conditional_t<TransB, ColumnMajor, RowMajor>;
-  using LayoutC = std::conditional_t<TransC, ColumnMajor, RowMajor>;
-
-  using CutlassGemm =
-      cutlass::gemm::device::Gemm<T,        // Data-type of A matrix
-                                  LayoutA,  // Layout of A matrix
-                                  T,        // Data-type of B matrix
-                                  LayoutB,  // Layout of B matrix
-                                  T,        // Data-type of C matrix
-                                  LayoutC>; // Layout of C matrix
-
-  // Define a CUTLASS GEMM type
-  CutlassGemm gemm_operator;
-
-  // Construct the CUTLASS GEMM arguments object.
-  //
-  // One of CUTLASS's design patterns is to define gemm argument objects that
-  // are constructible in host code and passed to kernels by value. These may
-  // include pointers, strides, scalars, and other arguments needed by Gemm and
-  // its components.
-  //
-  // The benefits of this pattern are (1.) a structured, composable strategy
-  // for passing host-constructible arguments to kernels and (2.) minimized
-  // initialization overhead on kernel entry.
-  //
-  typename CutlassGemm::Arguments args(
-      {M, N, K},      // Gemm Problem dimensions
-      {A, lda},       // Tensor-ref for source matrix A
-      {B, ldb},       // Tensor-ref for source matrix B
-      {C, ldc},       // Tensor-ref for source matrix C
-      {C, ldc},       // Tensor-ref for destination matrix D (may be different
-                      // memory than source C matrix)
-      {alpha, beta}); // Scalars used in the Epilogue
-
-  //
-  // Launch the CUTLASS GEMM kernel.
-  //
-
-  cutlass::Status status = gemm_operator(args, nullptr, s);
-
-  //
-  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
-  //
-
-  if (status != cutlass::Status::kSuccess) {
-    return cudaErrorUnknown;
-  }
-
-  // Return success, if no errors were encountered.
-  return cudaSuccess;
-}
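-
-// A minimal call sketch for row-major float matrices (pointer names and
-// problem sizes are illustrative):
-//   cudaError_t err = cutlassGemm<float, false, false, false>(
-//       stream, M, N, K, 1.0f, A_ptr, K, B_ptr, N, 0.0f, C_ptr, N);
-//   if (err != cudaSuccess) { /* handle the launch failure */ }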
-
-} // namespace spconv
diff --git a/include/spgemm/gemm_th.h b/include/spgemm/gemm_th.h
deleted file mode 100644
index 87c8eca..0000000
--- a/include/spgemm/gemm_th.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-#include <cuda_runtime_api.h>
-#include <spgemm/gemm.h>
-#include <torch/script.h>
-
-namespace spconv {
-void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
-void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
-                    torch::Tensor b);
-
-} // namespace spconv
\ No newline at end of file
diff --git a/include/sphash/hashmap.h b/include/sphash/hashmap.h
deleted file mode 100644
index f3071e2..0000000
--- a/include/sphash/hashmap.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <tsl/robin_map.h>
-
-namespace spconv {
-
-enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };
-
-template <HashTypes Type> struct HashMap;
-
-template <> struct HashMap<kDenseMap> {};
-
-} // namespace spconv
\ No newline at end of file
diff --git a/include/tensorrt/inference.h b/include/tensorrt/inference.h
deleted file mode 100644
index e222060..0000000
--- a/include/tensorrt/inference.h
+++ /dev/null
@@ -1,207 +0,0 @@
-#include "NvInfer.h"
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <tensorview/tensor.h>
-
-namespace trt {
-
-template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
-  switch (trt_dtype) {
-  case nvinfer1::DataType::kFLOAT:
-    return tv::float32;
-  case nvinfer1::DataType::kHALF:
-    return tv::float16;
-  case nvinfer1::DataType::kINT32:
-    return tv::int32;
-  case nvinfer1::DataType::kINT8:
-    return tv::int8;
-  default:;
-  }
-  TV_THROW_INVALID_ARG("unknown trt dtype");
-}
-
-struct InferDeleter {
-  template <typename T> void operator()(T *obj) const {
-    if (obj) {
-      obj->destroy();
-    }
-  }
-};
-
-template <typename T>
-using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
-
-class Logger : public nvinfer1::ILogger {
-public:
-  Logger(Severity severity = Severity::kWARNING)
-      : reportableSeverity(severity) {}
-
-  void log(Severity severity, const char *msg) override {
-    // suppress messages with severity enum value greater than the reportable
-    if (severity > reportableSeverity)
-      return;
-
-    switch (severity) {
-    case Severity::kINTERNAL_ERROR:
-      std::cerr << "INTERNAL_ERROR: ";
-      break;
-    case Severity::kERROR:
-      std::cerr << "ERROR: ";
-      break;
-    case Severity::kWARNING:
-      std::cerr << "WARNING: ";
-      break;
-    case Severity::kINFO:
-      std::cerr << "INFO: ";
-      break;
-    default:
-      std::cerr << "UNKNOWN: ";
-      break;
-    }
-    std::cerr << msg << std::endl;
-  }
-
-  Severity reportableSeverity;
-};
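-
-// A minimal ownership sketch using the Logger above (names illustrative);
-// trt_unique_ptr_t calls destroy() automatically:
-//   Logger logger(nvinfer1::ILogger::Severity::kWARNING);
-//   auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
-//       nvinfer1::createInferRuntime(logger));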
-
-class InferenceContext {
-public:
-  explicit InferenceContext(const std::string &engine_bin, int device)
-      : logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
-    TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    if (device >= deviceCount) {
-      TV_THROW_INVALID_ARG("you provided device ", device, " but only ",
-                           deviceCount, " devices are available.");
-    }
-    cudaSetDevice(device);
-    auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
-        nvinfer1::createInferRuntime(logger_));
-    engine_ =
-        trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
-            engine_bin.c_str(), engine_bin.size(), nullptr));
-    ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
-        engine_->createExecutionContext());
-
-    max_batch_size_ = engine_->getMaxBatchSize();
-    for (int i = 0; i < engine_->getNbBindings(); ++i) {
-      auto dims = engine_->getBindingDimensions(i);
-      std::vector<int64_t> shape_vec(dims.d, dims.d + dims.nbDims);
-      shape_vec.insert(shape_vec.begin(), {max_batch_size_});
-      tv::TensorShape shape(shape_vec);
-      std::string name = engine_->getBindingName(i);
-      auto trt_dtype = engine_->getBindingDataType(i);
-      auto tv_dtype = trt_dtype_to_tv(trt_dtype);
-      bool isInput = engine_->bindingIsInput(i);
-      name_to_idx_[name] = i;
-      idx_to_name_[i] = name;
-      name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
-      name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
-      if (isInput)
-        inp_idxes_.push_back(i);
-      else
-        out_idxes_.push_back(i);
-      bindings_.push_back(name_to_dev_mem_[name].raw_data());
-    }
-    checkCudaErrors(cudaStreamCreate(&stream_));
-  }
-
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::vector<tv::Tensor> inputs) {
-    TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide ",
-                          inp_idxes_.size(), " inputs, but got ",
-                          inputs.size());
-    // inference batch size
-    int bs = inputs[0].dim(0);
-    for (auto &inp : inputs) {
-      TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
-                            "batch sizes of all inputs must be the same");
-    }
-    TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "batch size ", bs,
-                          " exceeds max batch size ", max_batch_size_);
-    for (int i = 0; i < inputs.size(); ++i) {
-      auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
-      auto shape_inp = inputs[i].shape().subshape(1);
-      auto shape_dev = dev_mem.shape().subshape(1);
-      TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
-                            "shapes except the batch dim must match: ",
-                            shape_inp, shape_dev);
-      dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
-                                            stream_);
-    }
-
-    ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
-
-    for (int i : out_idxes_) {
-      name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
-          name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
-    }
-    checkCudaErrors(cudaStreamSynchronize(stream_));
-    std::unordered_map<std::string, tv::Tensor> output_map;
-    for (int i = 0; i < out_idxes_.size(); ++i) {
-      auto name = idx_to_name_[out_idxes_[i]];
-      output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
-    }
-    return output_map;
-  }
-
-  std::unordered_map<std::string, tv::Tensor>
-  operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
-    std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
-    int count = 0;
-    for (auto &p : inputs) {
-      auto iter = name_to_idx_.find(p.first);
-      TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(),
-                            "cannot find input name ", p.first);
-      inputs_vec[name_to_idx_[p.first]] = p.second;
-      // count matched inputs; the original left this counter untouched, which
-      // made the check below always fail.
-      ++count;
-    }
-    TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(),
-                          "not enough inputs provided");
-    return (*this)(inputs_vec);
-  }
-
-  tv::Tensor operator[](std::string name) {
-    auto iter = name_to_host_mem_.find(name);
-    if (iter == name_to_host_mem_.end()) {
-      TV_THROW_INVALID_ARG(name, "not found.");
-    }
-    return iter->second;
-  }
-
-  std::string repr() {
-    std::stringstream ss;
-    ss << "InferenceContext[gpu=" << device_ << "]";
-    ss << "\n  Inputs:";
-    std::string name;
-    for (auto &i : inp_idxes_) {
-      name = idx_to_name_[i];
-      auto &mem = name_to_host_mem_[name];
-      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
-         << "]: " << mem.shape();
-    }
-    ss << "\n  Outputs:";
-    for (auto &i : out_idxes_) {
-      name = idx_to_name_[i];
-      auto &mem = name_to_host_mem_[name];
-      ss << "\n    " << name << "[" << tv::detail::typeString(mem.dtype())
-         << "]: " << mem.shape();
-    }
-    return ss.str();
-  }
-
-private:
-  Logger logger_;
-  trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
-  trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
-  std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
-  std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
-  std::unordered_map<std::string, int> name_to_idx_;
-  std::unordered_map<int, std::string> idx_to_name_;
-  std::vector<int> inp_idxes_;
-  std::vector<int> out_idxes_;
-  std::vector<void *> bindings_;
-  cudaStream_t stream_;
-  int max_batch_size_;
-  int device_;
-};
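-
-// A minimal end-to-end usage sketch (engine string, device id, and binding
-// names are placeholders):
-//   InferenceContext ctx(engine_binary, /*device=*/0);
-//   std::cout << ctx.repr() << std::endl;
-//   auto outputs = ctx({{"input_0", input_tensor}});
-//   tv::Tensor out = outputs["output_0"];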
-
-} // namespace trt
diff --git a/include/tensorview/cc17.h b/include/tensorview/cc17.h
deleted file mode 100644
index c008840..0000000
--- a/include/tensorview/cc17.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-From PyTorch:
-
-Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
-Iain Melvin, Jason Weston)
-Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
-Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio,
-Johnny Mariethoz)
-
-From Caffe2:
-
-Copyright (c) 2016-present, Facebook Inc. All rights reserved.
-
-All contributions by Facebook:
-Copyright (c) 2016 Facebook Inc.
-
-All contributions by Google:
-Copyright (c) 2015 Google Inc.
-All rights reserved.
-
-All contributions by Yangqing Jia:
-Copyright (c) 2015 Yangqing Jia
-All rights reserved.
-
-All contributions from Caffe:
-Copyright(c) 2013, 2014, 2015, the respective contributors
-All rights reserved.
-
-All other contributions:
-Copyright(c) 2015, 2016 the respective contributors
-All rights reserved.
-
-Caffe2 uses a copyright model similar to Caffe: each contributor holds
-copyright over their contributions to Caffe2. The project versioning records
-all such contribution and copyright details. If a contributor wants to further
-mark their specific copyright on a particular contribution, they should
-indicate their copyright solely in the commit message of the change when it is
-committed.
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
-   America and IDIAP Research Institute nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-*/
-#pragma once
-#include <type_traits>
-#include <utility>
-
-namespace tv {
-
-#ifdef __cpp_lib_void_t
-
-template <class... Ts> using void_t = std::void_t<Ts...>;
-
-#else
-
-// Implementation taken from http://en.cppreference.com/w/cpp/types/void_t
-// (it takes CWG1558 into account and also works for older compilers)
-template <typename... Ts> struct make_void { typedef void type; };
-template <typename... Ts> using void_t = typename make_void<Ts...>::type;
-
-#endif
-
-namespace detail {
-struct _identity final {
-  template <class T> using type_identity = T;
-
-  template <class T> decltype(auto) operator()(T &&arg) {
-    return std::forward<T>(arg);
-  }
-};
-template <class Func, class Enable = void>
-struct function_takes_identity_argument : std::false_type {};
-#if defined(_MSC_VER)
-// For some weird reason, MSVC shows a compiler error when using guts::void_t
-// instead of std::void_t. But we're only building on MSVC versions that have
-// std::void_t, so let's just use that one.
-template <class Func>
-struct function_takes_identity_argument<
-    Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
-    : std::true_type {};
-#else
-template <class Func>
-struct function_takes_identity_argument<
-    Func, void_t<decltype(std::declval<Func>()(_identity()))>>
-    : std::true_type {};
-#endif
-
-template <bool Condition> struct _if_constexpr;
-
-template <> struct _if_constexpr<true> final {
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback &&thenCallback,
-                             ElseCallback && /* elseCallback */) {
-    // The _identity instance passed in can be used to delay evaluation of an
-    // expression, because the compiler can't know that it's just the identity
-    // we're passing in.
-    return thenCallback(_identity());
-  }
-
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback &&thenCallback,
-                             ElseCallback && /* elseCallback */) {
-    return thenCallback();
-  }
-};
-
-template <> struct _if_constexpr<false> final {
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback && /* thenCallback */,
-                             ElseCallback &&elseCallback) {
-    // The _identity instance passed in can be used to delay evaluation of an
-    // expression, because the compiler can't know that it's just the identity
-    // we're passing in.
-    return elseCallback(_identity());
-  }
-
-  template <
-      class ThenCallback, class ElseCallback,
-      std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
-                       void *> = nullptr>
-  static decltype(auto) call(ThenCallback && /* thenCallback */,
-                             ElseCallback &&elseCallback) {
-    return elseCallback();
-  }
-};
-} // namespace detail
-
-/*
- * Get something like C++17 if constexpr in C++14.
- *
- * Example 1: simple constexpr if/then/else
- *   template <int arg> int increment_absolute_value() {
- *     int result = arg;
- *     if_constexpr<(arg > 0)>(
- *         [&] { ++result; },  // then-case
- *         [&] { --result; }   // else-case
- *     );
- *     return result;
- *   }
- *
- * Example 2: without else case (i.e. conditionally prune code from assembly)
- *   template <int arg> int decrement_if_positive() {
- *     int result = arg;
- *     if_constexpr<(arg > 0)>(
- *         // This decrement operation is only present in the assembly for
- *         // template instances with arg > 0.
- *         [&] { --result; }
- *     );
- *     return result;
- *   }
- *
- * Example 3: branch based on type (i.e. replacement for SFINAE)
- *   struct MyClass1 { int value; };
- *   struct MyClass2 { int val; };
- *   template <class T>
- *   int func(T t) {
- *     return if_constexpr<std::is_same<T, MyClass1>::value>(
- *         [&](auto _) { return _(t).value; },  // this code is invalid for
- *                                              // T == MyClass2, so a regular
- *                                              // non-constexpr if statement
- *                                              // wouldn't compile
- *         [&](auto _) { return _(t).val; }     // this code is invalid for
- *                                              // T == MyClass1
- *     );
- *   }
- *
- * Note: The _ argument passed in Example 3 is the identity function, i.e. it
- * does nothing. It is used to force the compiler to delay type checking,
- * because the compiler doesn't know what kind of _ is passed in. Without it,
- * the compiler would fail when you try to access t.value but the member
- * doesn't exist.
- *
- * Note: In Example 3, both branches return int, so func() returns int. This
- * is not necessary. If func() had a return type of "auto", then both branches
- * could return different types, say func<MyClass1>() could return int and
- * func<MyClass2>() could return string.
- */
-template <bool Condition, class ThenCallback, class ElseCallback>
-decltype(auto) if_constexpr(ThenCallback &&thenCallback,
-                            ElseCallback &&elseCallback) {
-#if defined(__cpp_if_constexpr)
-  // If we have C++17, just use its "if constexpr" feature instead of wrapping
-  // it. This will give us better error messages.
-  if constexpr (Condition) {
-    if constexpr (detail::function_takes_identity_argument<
-                      ThenCallback>::value) {
-      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
-    } else {
-      return std::forward<ThenCallback>(thenCallback)();
-    }
-  } else {
-    if constexpr (detail::function_takes_identity_argument<
-                      ElseCallback>::value) {
-      return std::forward<ElseCallback>(elseCallback)(detail::_identity());
-    } else {
-      return std::forward<ElseCallback>(elseCallback)();
-    }
-  }
-#else
-  // C++14 implementation of if constexpr
-  return detail::_if_constexpr<Condition>::call(
-      std::forward<ThenCallback>(thenCallback),
-      std::forward<ElseCallback>(elseCallback));
-#endif
-}
-
-template <bool Condition, class ThenCallback>
-decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
-#if defined(__cpp_if_constexpr)
-  // If we have C++17, just use its "if constexpr" feature instead of wrapping
-  // it. This will give us better error messages.
-  if constexpr (Condition) {
-    if constexpr (detail::function_takes_identity_argument<
-                      ThenCallback>::value) {
-      return std::forward<ThenCallback>(thenCallback)(detail::_identity());
-    } else {
-      return std::forward<ThenCallback>(thenCallback)();
-    }
-  }
-#else
-  // C++14 implementation of if constexpr
-  return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
-                                 [](auto) {});
-#endif
-}
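-
-// tv::void_t above also enables the classic member-detection idiom; a minimal
-// sketch (the probed member `value` is illustrative):
-//   template <class T, class = void_t<>>
-//   struct has_value_member : std::false_type {};
-//   template <class T>
-//   struct has_value_member<T, void_t<decltype(std::declval<T>().value)>>
-//       : std::true_type {};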
-
-} // namespace tv
diff --git a/include/tensorview/common.h b/include/tensorview/common.h
deleted file mode 100644
index edbfbb1..0000000
--- a/include/tensorview/common.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <iostream>
-#include <sstream>
-#ifdef TV_USE_STACKTRACE
-#if defined(WIN32) || defined(_WIN32) ||                                       \
-    defined(__WIN32) && !defined(__CYGWIN__)
-#define BOOST_STACKTRACE_USE_WINDBG
-#else
-// require linking with -ldl and -lbacktrace in linux
-#define BOOST_STACKTRACE_USE_BACKTRACE
-#endif
-#include <boost/stacktrace.hpp>
-#endif
-
-namespace tv {
-
-template <class SStream, class T> void sstream_print(SStream &ss, T val) {
-  ss << val;
-}
-
-template <class SStream, class T, class... TArgs>
-void sstream_print(SStream &ss, T val, TArgs... args) {
-  ss << val << " ";
-  sstream_print(ss, args...);
-}
-
-template <class... TArgs> void ssprint(TArgs... args) {
-  std::stringstream ss;
-  sstream_print(ss, args...);
-  std::cout << ss.str() << std::endl;
-}
-
-#ifdef TV_USE_STACKTRACE
-#define TV_BACKTRACE_PRINT(ss)                                                 \
-  ss << std::endl << boost::stacktrace::stacktrace();
-#else
-#define TV_BACKTRACE_PRINT(ss)
-#endif
-
-#define TV_THROW_RT_ERR(...)                                                   \
-  {                                                                            \
-    std::stringstream __macro_s;                                               \
-    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
-    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
-    TV_BACKTRACE_PRINT(__macro_s);                                             \
-    throw std::runtime_error(__macro_s.str());                                 \
-  }
-
-#define TV_THROW_INVALID_ARG(...)                                              \
-  {                                                                            \
-    std::stringstream __macro_s;                                               \
-    __macro_s << __FILE__ << " " << __LINE__ << "\n";                          \
-    tv::sstream_print(__macro_s, __VA_ARGS__);                                 \
-    TV_BACKTRACE_PRINT(__macro_s);                                             \
-    throw std::invalid_argument(__macro_s.str());                              \
-  }
-
-#define TV_ASSERT_RT_ERR(expr, ...)                                            \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert failed. ";                                \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::runtime_error(__macro_s.str());                               \
-    }                                                                          \
-  }
-
-#define TV_ASSERT_INVALID_ARG(expr, ...)                                       \
-  {                                                                            \
-    if (!(expr)) {                                                             \
-      std::stringstream __macro_s;                                             \
-      __macro_s << __FILE__ << " " << __LINE__ << "\n";                        \
-      __macro_s << #expr << " assert failed. ";                                \
-      tv::sstream_print(__macro_s, __VA_ARGS__);                               \
-      TV_BACKTRACE_PRINT(__macro_s);                                           \
-      throw std::invalid_argument(__macro_s.str());                            \
-    }                                                                          \
-  }
-} // namespace tv
\ No newline at end of file
diff --git a/include/tensorview/cuda_utils.h b/include/tensorview/cuda_utils.h
deleted file mode 100644
index 9d0e78c..0000000
--- a/include/tensorview/cuda_utils.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-// from pytorch.aten
-#include "tensorview.h"
-#include <cuda_runtime_api.h>
-namespace tv {
-namespace cuda {
-
-template <typename T1, typename T2> inline int DivUp(const T1 a, const T2 b) {
-  return (a + b - 1) / b;
-}
-
-// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
-// CUDA: number of blocks for threads.
-
-inline int getNumThreads(const int N) {
-  if (N > CUDA_NUM_THREADS) {
-    return CUDA_NUM_THREADS;
-  }
-  return DivUp(N, 32) * 32;
-}
-
-inline int getBlocks(const int N) {
-  TV_ASSERT_RT_ERR(N > 0,
-                   "CUDA kernel launch blocks must be positive, but got N=", N);
-  return DivUp(N, getNumThreads(N));
-}
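-
-// A minimal launch-configuration sketch built on the helpers above (kernel
-// name and N are placeholders):
-//   int threads = getNumThreads(N);
-//   int blocks = getBlocks(N);
-//   myKernel<<<blocks, threads, 0, stream>>>(data, N);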
-
-} // namespace cuda
-
-} // namespace tv
\ No newline at end of file
diff --git a/include/tensorview/eigen_utils.h b/include/tensorview/eigen_utils.h
deleted file mode 100644
index b682ac2..0000000
--- a/include/tensorview/eigen_utils.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "tensor.h"
-#include "tensorview.h"
-#include <Eigen/Dense>
-
-namespace tv {
-
-template <typename T, int Row = Eigen::Dynamic, int Col = Eigen::Dynamic>
-Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>>
-tv2eigen(TensorView<T> view) {
-  TV_ASSERT_INVALID_ARG(view.ndim() <= 2 && view.ndim() > 0,
-                        "tv2eigen only supports 1-D or 2-D views");
-  if (Row != Eigen::Dynamic) {
-    TV_ASSERT_INVALID_ARG(view.dim(0) == Row, "row count mismatch");
-  }
-  if (Col != Eigen::Dynamic) {
-    TV_ASSERT_INVALID_ARG(view.dim(1) == Col, "column count mismatch");
-  }
-  int row = 1;
-  if (view.ndim() == 2) {
-    row = view.dim(0);
-  }
-  Eigen::Map<Eigen::Matrix<T, Row, Col, Eigen::RowMajor>> eigen_map(
-      view.data(), row, view.dim(1));
-  return eigen_map;
-}
-
-} // namespace tv
diff --git a/include/tensorview/kernel_utils.h b/include/tensorview/kernel_utils.h
deleted file mode 100644
index 00e9bc2..0000000
--- a/include/tensorview/kernel_utils.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#pragma once
-// from tensorflow
-namespace tv {
-namespace detail {
-
-template <typename T> class KernelLoop {
-  struct Iterator {
-    __forceinline__ __device__ Iterator(T index, T delta)
-        : index_(index), delta_(delta) {}
-    __forceinline__ __device__ T operator*() const { return index_; }
-    __forceinline__ __device__ Iterator &operator++() {
-      index_ += delta_;
-      return *this;
-    }
-    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
-      bool greater = index_ > other.index_;
-      bool less = index_ < other.index_;
-      // Anything past an end iterator (delta_ == 0) is equal.
-      // In range-based for loops, this optimizes to 'return less'.
-      if (!other.delta_) {
-        return less;
-      }
-      if (!delta_) {
-        return greater;
-      }
-      return less || greater;
-    }
-
-  private:
-    T index_;
-    const T delta_;
-  };
-
-public:
-  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
-      : begin_(begin), delta_(delta), end_(end) {}
-
-  __forceinline__ __device__ Iterator begin() const {
-    return Iterator{begin_, delta_};
-  }
-  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
-
-private:
-  T begin_;
-  T delta_;
-  T end_;
-};
-
-} // namespace detail
-
-// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
-// Usage: for(int i : KernelLoopX(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
-  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
-                               gridDim.x * blockDim.x * NumILP, count);
-}
-
-// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
-// Usage: for(int i : KernelLoopY(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
-  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
-                               gridDim.y * blockDim.y * NumILP, count);
-}
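-
-// A minimal grid-stride usage sketch (the kernel below is illustrative, with
-// the default ILP factor of 1 so every index is visited exactly once):
-//   __global__ void fillKernel(float *dst, float value, int count) {
-//     for (int i : tv::KernelLoopX<int, 1>(count)) {
-//       dst[i] = value;
-//     }
-//   }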
-
-// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
-// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
-template <typename T, int NumILP = 1>
-__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
-  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
-                               gridDim.z * blockDim.z * NumILP, count);
-}
-
-} // namespace tv
\ No newline at end of file
diff --git a/include/tensorview/mp_helper.h b/include/tensorview/mp_helper.h
deleted file mode 100644
index ec56cf7..0000000
--- a/include/tensorview/mp_helper.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef MP_HELPER_H_
-#define MP_HELPER_H_
-#include <type_traits>
-#include <utility>
-
-namespace tv {
-template <class... Ts> struct mp_list {};
-
-template <class T, T... Is>
-using mp_list_c = mp_list<std::integral_constant<T, Is>...>;
-
-template <int... Is>
-using mp_list_int_c = mp_list<std::integral_constant<int, Is>...>;
-
-namespace detail {
-
-template <class... Ts, class F>
-constexpr F mp_for_each_impl(mp_list<Ts...>, F &&f) {
-  return (void)(std::initializer_list<int>{(f(Ts()), 0)...}),
-         std::forward<F>(f);
-}
-
-template <class F> constexpr F mp_for_each_impl(mp_list<>, F &&f) {
-  return std::forward<F>(f);
-}
-
-} // namespace detail
-
-template <class... Ts>
-using mp_length = std::integral_constant<std::size_t, sizeof...(Ts)>;
-
-namespace detail {
-
-template <class A, template <class...> class B> struct mp_rename_impl {
-  // An error "no type named 'type'" here means that the first argument to
-  // mp_rename is not a list
-};
-
-template