Skip to content

Commit

Permalink
Going away from separate compilation, as cython does not allow multip…
Browse files Browse the repository at this point in the history
…le link stages
  • Loading branch information
pierrepaleo committed Mar 1, 2017
1 parent d9205d6 commit a1fd1c9
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 40 deletions.
21 changes: 14 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,35 @@ PDWTOBJ=build/wt.o build/common.o build/utils.o build/separable.o build/nonsepar

#
# Using constant memory accross several files requires to use separate compilation (relocatable device code),
# Otherwise a new constant memory buffer is created for each file (even if the symbol is defined in a common file).
# This was fine until the introduction of Wavelets::set_filters().
# Otherwise a new constant memory buffer is created for each file (even if the symbol is defined in a common file),
# since __constant__ variables have a file scope linkage. This was fine until the introduction of Wavelets::set_filters().
# As constant memory is managed through the use of "symbols" rather than buffers, another strategy would be to
# get the pointer address with cudaGetSymbolAddress().
# However, separate compilation might be the way to go for better modularity, easier refactoring and compilation speed.
# get the pointer address with cudaGetSymbolAddress(), which is not recommended.
#
# Separate compilation might be the way to go for better modularity, easier refactoring and compilation speed.
# However, cython does not offer flexibility to make two linkage steps (one "nvcc --dlink" to link the cuda ".o" together,
# the other to link the pyx ".o" with the linked cuda ".o").
#
# If you still want to use separate compilation :
# - replace "-c $^" with "-dc $^" in the Makefile targets rules
# - uncomment the definition of SEPARATE_COMPILATION in filters.h
#
demo: $(PDWTCORE) src/demo.cpp src/io.cpp
mkdir -p build
$(NVCC) -g $(CFLAGS) -odir build -dc $^
$(NVCC) -g $(CFLAGS) -odir build -c $^
$(NVCC) $(CFLAGS) -o demo $(PDWTOBJ) build/demo.o build/io.o $(LDFLAGS)


libpdwt.so: $(PDWTCORE)
mkdir -p build
$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -odir build -dc $^
$(NVCC) $(CFLAGS) --ptxas-options=-v --compiler-options '-fPIC' -odir build -c $^
$(NVCC) $(CFLAGS) -o $@ --shared $(PDWTOBJ) $(LDFLAGS)


# Double precision library
libpdwtd.so: $(PDWTCORE)
mkdir -p build
$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -DDOUBLEPRECISION -odir build -dc $^
$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -DDOUBLEPRECISION -odir build -c $^
$(NVCC) $(CFLAGS) -o $@ --shared $(PDWTOBJ) $(LDFLAGS)


Expand Down
12 changes: 12 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#define MAX_FILTER_WIDTH 40

#ifdef SEPARATE_COMPILATION
extern __constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
extern __constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
extern __constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
Expand All @@ -23,6 +24,17 @@ extern __constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
extern __constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
extern __constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
extern __constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
#else
__constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_IH[MAX_FILTER_WIDTH];

__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
#endif


__global__ void w_kern_soft_thresh(DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, DTYPE beta, int Nr, int Nc);
Expand Down
4 changes: 4 additions & 0 deletions src/filters.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
#define LIBEXPORT
#endif

// Uncomment to use separate compilation
// mind that the "-dc" compilation flag must be used instead of "-c"
// #define SEPARATE_COMPILATION


LIBEXPORT DTYPE DB2_L[4];

Expand Down
33 changes: 33 additions & 0 deletions src/nonseparable.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
#include "nonseparable.h"
#include "common.h"

#ifdef SEPARATE_COMPILATION
// Required for separate compilation (see Makefile)
#ifndef CONSTMEM_FILTERS_NS
#define CONSTMEM_FILTERS_NS
__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
#endif
#endif

// outer product of arrays "a", "b" of length "len"
DTYPE* w_outer(DTYPE* a, DTYPE* b, int len) {
Expand Down Expand Up @@ -73,6 +83,29 @@ int w_compute_filters(const char* wname, int direction, int do_swt) {
}


int w_set_filters_forward_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len) {
if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
return 0;
}

int w_set_filters_inverse_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len) {
if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
return 0;
}





Expand Down
4 changes: 4 additions & 0 deletions src/nonseparable.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

DTYPE* w_outer(DTYPE* a, DTYPE* b, int len);
int w_compute_filters(const char* wname, int direction, int do_swt);
int w_set_filters_forward_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len);
int w_set_filters_inverse_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len);



__global__ void w_kern_forward(DTYPE* img, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int hlen);
__global__ void w_kern_inverse(DTYPE* img, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int Nr2, int Nc2, int hlen);
Expand Down
33 changes: 26 additions & 7 deletions src/separable.cu
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
#include "separable.h"
#include "common.h"

#ifdef SEPARATE_COMPILATION
// Required for separate compilation (see Makefile)
#ifndef CONSTMEM_FILTERS
#define CONSTMEM_FILTERS
#ifndef CONSTMEM_FILTERS_S
#define CONSTMEM_FILTERS_S
__constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_IH[MAX_FILTER_WIDTH];

__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
#endif
#endif


Expand Down Expand Up @@ -57,6 +54,28 @@ int w_compute_filters_separable(const char* wname, int do_swt) {
}


int w_set_filters_forward(DTYPE* filter1, DTYPE* filter2, uint len) {
if (cudaMemcpyToSymbol(c_kern_L, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_H, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
return 0;
}

int w_set_filters_inverse(DTYPE* filter1, DTYPE* filter2, uint len) {
if (cudaMemcpyToSymbol(c_kern_IL, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_IH, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
return 0;
}








Expand Down
4 changes: 4 additions & 0 deletions src/separable.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
#include "utils.h"

int w_compute_filters_separable(const char* wname, int do_swt);
int w_set_filters_forward(DTYPE* filter1, DTYPE* filter2, uint len);
int w_set_filters_inverse(DTYPE* filter1, DTYPE* filter2, uint len);


__global__ void w_kern_forward_pass1(DTYPE* img, DTYPE* tmp_a1, DTYPE* tmp_a2, int Nr, int Nc, int hlen);
__global__ void w_kern_forward_pass2(DTYPE* tmp_a1, DTYPE* tmp_a2, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int hlen);
int w_forward_separable(DTYPE* d_image, DTYPE** d_coeffs, DTYPE* d_tmp, w_info winfos);
Expand Down
38 changes: 12 additions & 26 deletions src/wt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -543,64 +543,50 @@ void Wavelets::print_informations() {
/// Provide a custom filter bank to the current Wavelet instance.
/// If do_separable = 1, the filters are expected to be L, H.
/// Otherwise, the filters are expected to be A, H, V, D (square size)
// We cannot directly use the __constant__ symbols (unless with separate compilation),
// hence a further indirection in (non)separable.cu where these symbols are defined
int Wavelets::set_filters_forward(char* filtername, uint len, DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4) {
int res = 0;
if (len > MAX_FILTER_WIDTH) {
printf("ERROR: Wavelets.set_filters_forward(): filter length (%d) exceeds the maximum size (%d)\n", len, MAX_FILTER_WIDTH);
return -1;
}
if (do_separable) {
if (cudaMemcpyToSymbol(c_kern_L, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_H, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
res = w_set_filters_forward(filter1, filter2, len);
}
else {
if (filter3 == NULL || filter4 == NULL) {
puts("ERROR: Wavelets.set_filters_forward(): expected argument 4 and 5 for non-separable filtering");
return -2;
}
if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
res = w_set_filters_forward_nonseparable(filter1, filter2, filter3, filter4, len);
}
winfos.hlen = len;
strncpy(wname, filtername, 128);

return 0;
return res;
}

/// Here the filters are assumed to be of the same size of those provided to set_filters_forward()
// We cannot directly use the __constant__ symbols (unless with separate compilation),
// hence a further indirection in (non)separable.cu where these symbols are defined
int Wavelets::set_filters_inverse(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4) {
uint len = winfos.hlen;
int res = 0;
if (do_separable) {
// ignoring args 4 and 5
if (cudaMemcpyToSymbol(c_kern_IL, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_IH, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
res = w_set_filters_inverse(filter1, filter2, len);
}
else {
if (filter3 == NULL || filter4 == NULL) {
puts("ERROR: Wavelets.set_filters_inverse(): expected argument 4 and 5 for non-separable filtering");
return -2;
}
// The same symbols are used for the inverse filters
if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
|| cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
{
return -3;
}
res = w_set_filters_inverse_nonseparable(filter1, filter2, filter3, filter4, len);
}

return 0;
return res;
}


Expand Down

0 comments on commit a1fd1c9

Please sign in to comment.