From cd99e7a63be24131ada321dc9c2309d4c4fd3cda Mon Sep 17 00:00:00 2001 From: "yan.yan" Date: Fri, 24 Mar 2023 00:30:45 +0800 Subject: [PATCH] fix #575 use a flag to enable large-kernel algo --- CHANGELOG.md | 2 +- docs/PERFORMANCE_GUIDE.md | 4 +- spconv/pytorch/conv.py | 436 ++++++++++++++++++++++---------------- 3 files changed, 260 insertions(+), 182 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac5e860..4e58729 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## [2.3.5] - 2023-03-24 ### Fixed -- pypi project reach size limit, so we need to assign a new version number. +- use a flag to enable large kernel algo (need time to compile at runtime) ## [2.3.4] - 2023-03-23 ### Added diff --git a/docs/PERFORMANCE_GUIDE.md b/docs/PERFORMANCE_GUIDE.md index 7869a7d..df1816e 100644 --- a/docs/PERFORMANCE_GUIDE.md +++ b/docs/PERFORMANCE_GUIDE.md @@ -26,4 +26,6 @@ * spconv 2.x in Windows 10 is 1.5x~2x slower than Linux. use Linux if possible. * If you train with float32 and ampere or later GPUs, you can set ```spconv.constants.SPCONV_ALLOW_TF32``` to enable faster fp32 training. See [benchmark](BENCHMARK.md) for more performance details of different algorithms. -* Different CUDA version of spconv may have different performance. Use newest cuda version if possible. For example, spconv-cu117 is faster than spconv-cu114, spconv-cu114 is faster than spconv-cu111. \ No newline at end of file +* Different CUDA version of spconv may have different performance. Use newest cuda version if possible. For example, spconv-cu117 is faster than spconv-cu114, spconv-cu114 is faster than spconv-cu111. +* if your kernel size volume larger than 32, spconv will use a slower (and more inaccurate in fp16) algorithm. to use a faster algo for large kernel size (need time to compile at runtime), use ```large_kernel_fast_algo=True``` +* use ```SparseGlobalMaxPool``` instead of use large kernel size when you need global pool. \ No newline at end of file diff --git a/spconv/pytorch/conv.py b/spconv/pytorch/conv.py index c056022..4e9e185 100644 --- a/spconv/pytorch/conv.py +++ b/spconv/pytorch/conv.py @@ -43,7 +43,9 @@ _MAX_NUM_VOXELS_DURING_TRAINING = "max_num_voxels_during_training" -def _apply_act(x: torch.Tensor, act_type: tv.gemm.Activation, act_alpha: float, act_beta: float): + +def _apply_act(x: torch.Tensor, act_type: tv.gemm.Activation, act_alpha: float, + act_beta: float): if act_type == tv.gemm.Activation.None_: return x elif act_type == tv.gemm.Activation.ReLU: @@ -55,7 +57,9 @@ def _apply_act(x: torch.Tensor, act_type: tv.gemm.Activation, act_alpha: float, else: raise NotImplementedError + class SparseConvolutionBase: + def __init__(self, ndim: int, in_channels: int, @@ -76,7 +80,8 @@ def __init__(self, record_voxel_count: bool = False, act_type: tv.gemm.Activation = tv.gemm.Activation.None_, act_alpha: float = 0, - act_beta: float = 0): + act_beta: float = 0, + large_kernel_fast_algo: bool = False): assert groups == 1, "don't support groups for now" self.ndim = ndim self.in_channels = in_channels @@ -103,7 +108,10 @@ def __init__(self, self.indice_key = indice_key self.record_voxel_count = record_voxel_count if algo is None: - if kv <= 128 and not CPU_ONLY_BUILD: + limit = 32 + if large_kernel_fast_algo: + limit = 128 + if kv <= limit and not CPU_ONLY_BUILD: if kv < 8: algo = ConvAlgo.MaskImplicitGemm else: @@ -117,7 +125,7 @@ def __init__(self, self.algo = algo self.fp32_accum = fp32_accum # self.algo = ConvAlgo.Native - + if self.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC: if FILTER_HWIO: # RSCK @@ -136,16 +144,23 @@ def __init__(self, self.zero_point = 0 if self.conv1x1: assert act_type == tv.gemm.Activation.None_, "conv1x1 don't support fused act" - + def is_inverseable(self): return self.indice_key is not None and not self.subm - def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.Tensor, bias: Optional[torch.Tensor], add_input: Optional[SparseConvTensor] = None, - channel_scale: Optional[torch.Tensor] = None, output_scale: Optional[float] = None, name: Optional[str] = None, - sparse_unique_name: str = "", - act_type: tv.gemm.Activation = tv.gemm.Activation.None_, - act_alpha: float = 0, - act_beta: float = 0): + def _conv_forward(self, + training: bool, + input: SparseConvTensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor], + add_input: Optional[SparseConvTensor] = None, + channel_scale: Optional[torch.Tensor] = None, + output_scale: Optional[float] = None, + name: Optional[str] = None, + sparse_unique_name: str = "", + act_type: tv.gemm.Activation = tv.gemm.Activation.None_, + act_alpha: float = 0, + act_beta: float = 0): # assert isinstance(input, SparseConvTensor) is_int8 = input.is_quantized and weight.is_quantized if is_int8: @@ -165,9 +180,9 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T if add_input is not None: output_add_scale = add_input.q_scale() if training: - msg = "act don't support backward, only used in inference" + msg = "act don't support backward, only used in inference" assert self.act_type == tv.gemm.Activation.None_, msg - + if not self.subm: if self.transposed: out_spatial_shape = ops.get_deconv_output_size( @@ -252,7 +267,7 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T indice_pair_num = datas.indice_pair_num out_spatial_shape = datas.spatial_shape self._check_inverse_reuse_valid(input, spatial_shape, - datas) + datas) else: if self.indice_key is not None and datas is not None: outids = datas.out_indices @@ -283,8 +298,8 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T if input.benchmark: torch.cuda.synchronize() interval = time.time() - t - out_tensor.benchmark_record[ - name]["indice_gen_time"].append(interval) + out_tensor.benchmark_record[name][ + "indice_gen_time"].append(interval) indice_data = IndiceData(outids, indices, @@ -310,43 +325,21 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T indice_pairs_calc = indice_pairs.to(features.device) if self.subm: out_features = Fsp.indice_subm_conv( - features, - weight, - indice_pairs_calc, - indice_pair_num, - outids.shape[0], - algo, - input._timer, - bias_for_infer, - act_alpha, - act_beta, - act_type) + features, weight, indice_pairs_calc, indice_pair_num, + outids.shape[0], algo, input._timer, bias_for_infer, + act_alpha, act_beta, act_type) else: if self.inverse: out_features = Fsp.indice_inverse_conv( - features, - weight, - indice_pairs_calc, - indice_pair_num, - outids.shape[0], - algo, - input._timer, - bias_for_infer, - act_alpha, - act_beta, + features, weight, indice_pairs_calc, + indice_pair_num, outids.shape[0], algo, + input._timer, bias_for_infer, act_alpha, act_beta, act_type) else: out_features = Fsp.indice_conv( - features, - weight, - indice_pairs_calc, - indice_pair_num, - outids.shape[0], - algo, - input._timer, - bias_for_infer, - act_type, - act_beta, + features, weight, indice_pairs_calc, + indice_pair_num, outids.shape[0], algo, + input._timer, bias_for_infer, act_type, act_beta, act_type) else: datas = input.find_indice_pair(self.indice_key) @@ -367,7 +360,7 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T # assert datas.ksize == self.kernel_size, "inverse conv must have same kernel size as its couple conv" self._check_inverse_reuse_valid(input, spatial_shape, - datas) + datas) else: if self.indice_key is not None and datas is not None: outids = datas.out_indices @@ -416,8 +409,8 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T if input.benchmark: torch.cuda.synchronize() interval = time.time() - t - out_tensor.benchmark_record[ - name]["indice_gen_time"].append(interval) + out_tensor.benchmark_record[name][ + "indice_gen_time"].append(interval) outids = res[0] num_inds_per_loc = res[1] pair_fwd = res[2] @@ -465,28 +458,34 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T pair_mask_fwd_splits, pair_mask_bwd_splits, mask_argsort_fwd_splits, mask_argsort_bwd_splits, num_activate_out, masks, training, self.subm, - input._timer, self.fp32_accum, - bias_cur, - act_alpha, - act_beta, - act_type) + input._timer, self.fp32_accum, bias_cur, act_alpha, + act_beta, act_type) else: - output_dtype = None + output_dtype = None if output_scale is None: output_dtype = weight.dtype out_features, _, _ = ops.implicit_gemm( - features, weight_cur, pair_fwd, pair_mask_fwd_splits, - mask_argsort_fwd_splits, - num_activate_out, masks, training, self.subm, - input._timer, self.fp32_accum, + features, + weight_cur, + pair_fwd, + pair_mask_fwd_splits, + mask_argsort_fwd_splits, + num_activate_out, + masks, + training, + self.subm, + input._timer, + self.fp32_accum, bias_cur, act_alpha, act_beta, act_type, # TODO do we really need output scale to scale bias in kernel? - 1.0 if output_scale is None else output_scale, # output_scale - channel_scale, # scale - output_add=add_input.features if add_input is not None else None, + 1.0 if output_scale is None else + output_scale, # output_scale + channel_scale, # scale + output_add=add_input.features + if add_input is not None else None, output_add_scale=output_add_scale, output_dtype=output_dtype) @@ -511,8 +510,10 @@ def _conv_forward(self, training: bool, input: SparseConvTensor, weight: torch.T out_tensor.spatial_shape = out_spatial_shape if add_input is not None and not is_int8: # in int8, we apply add + act in kernel. - out_tensor = out_tensor.replace_feature(_apply_act(out_tensor.features + add_input.features, self.act_type, self.act_alpha, self.act_beta)) - + out_tensor = out_tensor.replace_feature( + _apply_act(out_tensor.features + add_input.features, + self.act_type, self.act_alpha, self.act_beta)) + return out_tensor def _check_subm_reuse_valid(self, inp: SparseConvTensor, @@ -539,9 +540,9 @@ def _check_subm_reuse_valid(self, inp: SparseConvTensor, ) def _check_inverse_reuse_valid(self, inp: SparseConvTensor, - spatial_shape: List[int], - datas: Union[ImplicitGemmIndiceData, - IndiceData]): + spatial_shape: List[int], + datas: Union[ImplicitGemmIndiceData, + IndiceData]): if self.kernel_size != datas.ksize: raise ValueError( f"Inverse with same indice_key must have same kernel" @@ -556,8 +557,8 @@ def _check_inverse_reuse_valid(self, inp: SparseConvTensor, raise ValueError( f"Inverse with same indice_key must have same num of indices" f", expect {datas.indices.shape[0]}, input {inp.indices.shape[0]}, " - "please check Inverse Convolution in ." - ) + "please check Inverse Convolution in .") + class SparseConvolution(SparseConvolutionBase, SparseModule): __constants__ = [ @@ -586,11 +587,21 @@ def __init__(self, act_type: tv.gemm.Activation = tv.gemm.Activation.None_, act_alpha: float = 0, act_beta: float = 0, + large_kernel_fast_algo: bool = False, name=None, - device=None, + device=None, dtype=None): factory_kwargs = {"device": device, "dtype": dtype} - SparseConvolutionBase.__init__(self, ndim, in_channels, out_channels, kernel_size, stride, padding, dilation, groups, + SparseConvolutionBase.__init__( + self, + ndim, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, bias=False, subm=subm, output_padding=output_padding, @@ -602,14 +613,17 @@ def __init__(self, record_voxel_count=record_voxel_count, act_type=act_type, act_alpha=act_alpha, - act_beta=act_beta) + act_beta=act_beta, + large_kernel_fast_algo=large_kernel_fast_algo) SparseModule.__init__(self, name=name) if record_voxel_count and not self.subm and not self.inverse: # we record maximum voxel num in both inference and training if # record_voxel_count flag setting. - self.register_buffer(_MAX_NUM_VOXELS_DURING_TRAINING, - torch.zeros(1, dtype=torch.int32, device=device)) - self.weight = Parameter(torch.zeros(*self.weight_shape, **factory_kwargs)) + self.register_buffer( + _MAX_NUM_VOXELS_DURING_TRAINING, + torch.zeros(1, dtype=torch.int32, device=device)) + self.weight = Parameter( + torch.zeros(*self.weight_shape, **factory_kwargs)) if bias: self.bias = Parameter(torch.zeros(out_channels, **factory_kwargs)) else: @@ -636,8 +650,7 @@ def _load_weight_different_layout(self, state_dict, prefix, local_metadata, error_msgs): name = prefix + _MAX_NUM_VOXELS_DURING_TRAINING if self.record_voxel_count and not self.subm and not self.inverse and name not in state_dict: - state_dict[name] = torch.zeros( - 1, dtype=torch.int32) + state_dict[name] = torch.zeros(1, dtype=torch.int32) if not SAVED_WEIGHT_LAYOUT: return key = prefix + "weight" @@ -736,14 +749,23 @@ def reset_parameters(self): bound = 1 / math.sqrt(fan_in) init.uniform_(self.bias, -bound, bound) - - def forward(self, input: SparseConvTensor, add_input: Optional[SparseConvTensor] = None): - return self._conv_forward(self.training, input, self.weight, self.bias, add_input, - name=self.name, sparse_unique_name=self._sparse_unique_name, act_type=self.act_type, - act_alpha=self.act_alpha, act_beta=self.act_beta) + def forward(self, + input: SparseConvTensor, + add_input: Optional[SparseConvTensor] = None): + return self._conv_forward(self.training, + input, + self.weight, + self.bias, + add_input, + name=self.name, + sparse_unique_name=self._sparse_unique_name, + act_type=self.act_type, + act_alpha=self.act_alpha, + act_beta=self.act_beta) class SparseConv1d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -757,6 +779,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConv1d, self).__init__(1, @@ -772,10 +795,12 @@ def __init__(self, algo=algo, fp32_accum=fp32_accum, record_voxel_count=record_voxel_count, + large_kernel_fast_algo=large_kernel_fast_algo, name=name) class SparseConv2d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -789,6 +814,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConv2d, self).__init__(2, @@ -803,11 +829,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConv3d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -821,6 +849,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConv3d, self).__init__(3, @@ -835,11 +864,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConv4d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -853,6 +884,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConv4d, self).__init__(4, @@ -867,11 +899,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConvTranspose1d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -885,6 +919,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConvTranspose1d, self).__init__(1, @@ -900,11 +935,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConvTranspose2d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -918,6 +955,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConvTranspose2d, self).__init__(2, @@ -933,11 +971,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConvTranspose3d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -951,6 +991,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConvTranspose3d, self).__init__(3, @@ -966,11 +1007,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseConvTranspose4d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -984,6 +1027,7 @@ def __init__(self, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, record_voxel_count: bool = False, + large_kernel_fast_algo: bool = False, name=None): super(SparseConvTranspose4d, self).__init__(4, @@ -999,11 +1043,13 @@ def __init__(self, indice_key=indice_key, algo=algo, fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, record_voxel_count=record_voxel_count, name=name) class SparseInverseConv1d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1012,20 +1058,24 @@ def __init__(self, bias=True, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SparseInverseConv1d, self).__init__(1, - in_channels, - out_channels, - kernel_size, - bias=bias, - inverse=True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SparseInverseConv1d, + self).__init__(1, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SparseInverseConv2d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1034,20 +1084,24 @@ def __init__(self, bias=True, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SparseInverseConv2d, self).__init__(2, - in_channels, - out_channels, - kernel_size, - bias=bias, - inverse=True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SparseInverseConv2d, + self).__init__(2, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SparseInverseConv3d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1056,20 +1110,24 @@ def __init__(self, bias=True, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SparseInverseConv3d, self).__init__(3, - in_channels, - out_channels, - kernel_size, - bias=bias, - inverse=True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SparseInverseConv3d, + self).__init__(3, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SparseInverseConv4d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1078,20 +1136,24 @@ def __init__(self, bias=True, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SparseInverseConv4d, self).__init__(4, - in_channels, - out_channels, - kernel_size, - bias=bias, - inverse=True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SparseInverseConv4d, + self).__init__(4, + in_channels, + out_channels, + kernel_size, + bias=bias, + inverse=True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SubMConv1d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1104,24 +1166,28 @@ def __init__(self, indice_key=None, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SubMConv1d, self).__init__(1, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SubMConv1d, + self).__init__(1, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SubMConv2d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1134,24 +1200,28 @@ def __init__(self, indice_key=None, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SubMConv2d, self).__init__(2, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SubMConv2d, + self).__init__(2, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SubMConv3d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1164,24 +1234,28 @@ def __init__(self, indice_key=None, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SubMConv3d, self).__init__(3, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SubMConv3d, + self).__init__(3, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) class SubMConv4d(SparseConvolution): + def __init__(self, in_channels, out_channels, @@ -1194,21 +1268,24 @@ def __init__(self, indice_key=None, algo: Optional[ConvAlgo] = None, fp32_accum: Optional[bool] = None, + large_kernel_fast_algo: bool = False, name=None): - super(SubMConv4d, self).__init__(4, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - True, - indice_key=indice_key, - algo=algo, - fp32_accum=fp32_accum, - name=name) + super(SubMConv4d, + self).__init__(4, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + True, + indice_key=indice_key, + algo=algo, + fp32_accum=fp32_accum, + large_kernel_fast_algo=large_kernel_fast_algo, + name=name) DEFAULT_SPARSE_CONV_TYPES = { @@ -1229,4 +1306,3 @@ def __init__(self, SparseConvTranspose3d, SparseConvTranspose4d, } -