diff --git a/cpp/src/convolution/_convolution.cu b/cpp/src/convolution/_convolution.cu index 7dba54ce..8d8c0444 100644 --- a/cpp/src/convolution/_convolution.cu +++ b/cpp/src/convolution/_convolution.cu @@ -158,7 +158,12 @@ __device__ void _cupy_correlate( const T *__restrict__ inp, if ( mode == 0 ) { // Valid if ( tid >= 0 && tid < inpW ) { for ( int j = 0; j < kerW; j++ ) { - temp += inp[tid + j] * kernel[j]; + if constexpr ( std::is_same_v<T, thrust::complex<float>> || + std::is_same_v<T, thrust::complex<double>> ) { + temp += inp[tid + j] * thrust::conj( kernel[j] ); + } else { + temp += inp[tid + j] * kernel[j]; + } } } } else if ( mode == 1 ) { // Same @@ -171,7 +176,12 @@ __device__ void _cupy_correlate( const T *__restrict__ inp, } for ( int j = 0; j < kerW; j++ ) { if ( ( start + j >= 0 ) && ( start + j < inpW ) ) { - temp += inp[start + j] * kernel[j]; + if constexpr ( std::is_same_v<T, thrust::complex<float>> || + std::is_same_v<T, thrust::complex<double>> ) { + temp += inp[start + j] * thrust::conj( kernel[j] ); + } else { + temp += inp[start + j] * kernel[j]; + } } } } else { // Full @@ -179,13 +189,22 @@ __device__ void _cupy_correlate( const T *__restrict__ inp, const int start { 0 - P1 + tid }; for ( int j = 0; j < kerW; j++ ) { if ( ( start + j >= 0 ) && ( start + j < inpW ) ) { - temp += inp[start + j] * kernel[j]; + if constexpr ( std::is_same_v<T, thrust::complex<float>> || + std::is_same_v<T, thrust::complex<double>> ) { + temp += inp[start + j] * thrust::conj( kernel[j] ); + } else { + temp += inp[start + j] * kernel[j]; + } } } } if ( swapped_inputs ) { - out[outW - tid - 1] = temp; // TODO: Move to shared memory + if constexpr ( std::is_same_v<T, thrust::complex<float>> || std::is_same_v<T, thrust::complex<double>> ) { + out[outW - tid - 1] = thrust::conj( temp ); + } else { + out[outW - tid - 1] = temp; + } } else { out[tid] = temp; } diff --git a/python/cusignal/test/test_convolution.py b/python/cusignal/test/test_convolution.py index a7015aee..8c810717 100644 --- a/python/cusignal/test/test_convolution.py +++ b/python/cusignal/test/test_convolution.py @@ -14,6 +14,7 @@ import cupy as cp import cusignal import pytest +import numpy as np from 
cusignal.test.utils import array_equal, _check_rapids_pytest_benchmark from scipy import signal @@ -23,7 +24,20 @@ class TestConvolution: @pytest.mark.benchmark(group="Correlate") - @pytest.mark.parametrize("num_samps", [2 ** 7, 2 ** 10 + 1, 2 ** 13]) + @pytest.mark.parametrize( + "dtype", + [ + np.int32, + np.int64, + np.float32, + np.float64, + np.complex64, + np.complex128, + ], + ) + @pytest.mark.parametrize( + "num_samps", [2 ** 7, 2 ** 10 + 1, 2 ** 13] + ) @pytest.mark.parametrize("num_taps", [125, 2 ** 8, 2 ** 13]) @pytest.mark.parametrize("mode", ["full", "valid", "same"]) @pytest.mark.parametrize("method", ["direct", "fft", "auto"]) @@ -44,27 +58,29 @@ def test_correlate1d_cpu( self, rand_data_gen, benchmark, + dtype, num_samps, num_taps, mode, method, ): - cpu_sig, _ = rand_data_gen(num_samps, 1) - cpu_filt, _ = rand_data_gen(num_taps, 1) + cpu_sig, _ = rand_data_gen(num_samps, 1, dtype) + cpu_filt, _ = rand_data_gen(num_taps, 1, dtype) benchmark(self.cpu_version, cpu_sig, cpu_filt, mode, method) def test_correlate1d_gpu( self, rand_data_gen, gpubenchmark, + dtype, num_samps, num_taps, mode, method, ): - cpu_sig, gpu_sig = rand_data_gen(num_samps, 1) - cpu_filt, gpu_filt = rand_data_gen(num_taps, 1) + cpu_sig, gpu_sig = rand_data_gen(num_samps, 1, dtype) + cpu_filt, gpu_filt = rand_data_gen(num_taps, 1, dtype) output = gpubenchmark( self.gpu_version, gpu_sig, @@ -77,7 +93,20 @@ def test_correlate1d_gpu( array_equal(output, key) @pytest.mark.benchmark(group="Convolve") - @pytest.mark.parametrize("num_samps", [2 ** 7, 2 ** 10 + 1, 2 ** 13]) + @pytest.mark.parametrize( + "dtype", + [ + np.int32, + np.int64, + np.float32, + np.float64, + np.complex64, + np.complex128, + ], + ) + @pytest.mark.parametrize( + "num_samps", [2 ** 7, 2 ** 10 + 1, 2 ** 13] + ) @pytest.mark.parametrize("num_taps", [125, 2 ** 8, 2 ** 13]) @pytest.mark.parametrize("mode", ["full", "valid", "same"]) @pytest.mark.parametrize("method", ["direct", "fft", "auto"]) @@ -96,13 +125,14 @@ def 
test_convolve1d_cpu( self, rand_data_gen, benchmark, + dtype, num_samps, num_taps, mode, method, ): - cpu_sig, _ = rand_data_gen(num_samps, 1) - cpu_win = signal.windows.hann(num_taps, 1) + cpu_sig, _ = rand_data_gen(num_samps, 1, dtype) + cpu_win = signal.windows.hann(num_taps, 1).astype(dtype) benchmark(self.cpu_version, cpu_sig, cpu_win, mode, method) @@ -110,19 +140,20 @@ def test_convolve1d_gpu( self, rand_data_gen, gpubenchmark, + dtype, num_samps, num_taps, mode, method, ): - cpu_sig, gpu_sig = rand_data_gen(num_samps, 1) - gpu_win = cusignal.windows.hann(num_taps, 1) + cpu_sig, gpu_sig = rand_data_gen(num_samps, 1, dtype) + gpu_win = cusignal.windows.hann(num_taps, 1).astype(dtype) output = gpubenchmark( self.gpu_version, gpu_sig, gpu_win, mode, method ) - cpu_win = signal.windows.hann(num_taps, 1) + cpu_win = signal.windows.hann(num_taps, 1).astype(dtype) key = self.cpu_version(cpu_sig, cpu_win, mode, method) array_equal(output, key) @@ -157,6 +188,17 @@ def test_fftconvolve_gpu( array_equal(output, key) @pytest.mark.benchmark(group="Convolve2d") + @pytest.mark.parametrize( + "dtype", + [ + np.int32, + np.int64, + np.float32, + np.float64, + np.complex64, + np.complex128, + ], + ) @pytest.mark.parametrize("num_samps", [2 ** 8]) @pytest.mark.parametrize("num_taps", [5, 100]) @pytest.mark.parametrize("boundary", ["fill", "wrap", "symm"]) @@ -178,27 +220,29 @@ def test_convolve2d_cpu( self, rand_data_gen, benchmark, + dtype, num_samps, num_taps, boundary, mode, ): - cpu_sig, _ = rand_data_gen(num_samps, 2) - cpu_filt, _ = rand_data_gen(num_taps, 2) + cpu_sig, _ = rand_data_gen(num_samps, 2, dtype) + cpu_filt, _ = rand_data_gen(num_taps, 2, dtype) benchmark(self.cpu_version, cpu_sig, cpu_filt, boundary, mode) def test_convolve2d_gpu( self, rand_data_gen, gpubenchmark, + dtype, num_samps, num_taps, boundary, mode, ): - cpu_sig, gpu_sig = rand_data_gen(num_samps, 2) - cpu_filt, gpu_filt = rand_data_gen(num_taps, 2) + cpu_sig, gpu_sig = rand_data_gen(num_samps, 
2, dtype) + cpu_filt, gpu_filt = rand_data_gen(num_taps, 2, dtype) output = gpubenchmark( self.gpu_version, gpu_sig, @@ -211,6 +255,17 @@ def test_convolve2d_gpu( array_equal(output, key) @pytest.mark.benchmark(group="Correlate2d") + @pytest.mark.parametrize( + "dtype", + [ + np.int32, + np.int64, + np.float32, + np.float64, + np.complex64, + np.complex128, + ], + ) @pytest.mark.parametrize("num_samps", [2 ** 8]) @pytest.mark.parametrize("num_taps", [5, 100]) @pytest.mark.parametrize("boundary", ["fill", "wrap", "symm"]) @@ -232,27 +287,29 @@ def test_correlate2d_cpu( self, rand_data_gen, benchmark, + dtype, num_samps, num_taps, boundary, mode, ): - cpu_sig, _ = rand_data_gen(num_samps, 2) - cpu_filt, _ = rand_data_gen(num_taps, 2) + cpu_sig, _ = rand_data_gen(num_samps, 2, dtype) + cpu_filt, _ = rand_data_gen(num_taps, 2, dtype) benchmark(self.cpu_version, cpu_sig, cpu_filt, boundary, mode) def test_correlate2d_gpu( self, rand_data_gen, gpubenchmark, + dtype, num_samps, num_taps, boundary, mode, ): - cpu_sig, gpu_sig = rand_data_gen(num_samps, 2) - cpu_filt, gpu_filt = rand_data_gen(num_taps, 2) + cpu_sig, gpu_sig = rand_data_gen(num_samps, 2, dtype) + cpu_filt, gpu_filt = rand_data_gen(num_taps, 2, dtype) output = gpubenchmark( self.gpu_version, gpu_sig,