Going away from separate compilation, as cython does not allow multip…

…le link stages
pierrepaleo · Mar 1, 2017 · a1fd1c9 · a1fd1c9
1 parent d9205d6
commit a1fd1c9
Show file tree

Hide file tree

Showing 8 changed files with 109 additions and 40 deletions.
diff --git a/Makefile b/Makefile
@@ -7,28 +7,35 @@ PDWTOBJ=build/wt.o build/common.o build/utils.o build/separable.o build/nonsepar
 
 #
 # Using constant memory accross several files requires to use separate compilation (relocatable device code),
-# Otherwise a new constant memory buffer is created for each file (even if the symbol is defined in a common file).
-# This was fine until the introduction of Wavelets::set_filters().
+# Otherwise a new constant memory buffer is created for each file (even if the symbol is defined in a common file),
+# since __constant__ variables have a file scope linkage. This was fine until the introduction of Wavelets::set_filters().
 # As constant memory is managed through the use of "symbols" rather than buffers, another strategy would be to
-# get the pointer address with cudaGetSymbolAddress().
-# However, separate compilation might be the way to go for better modularity, easier refactoring and compilation speed.
+# get the pointer address with cudaGetSymbolAddress(), which is not recommended.
+#
+# Separate compilation might be the way to go for better modularity, easier refactoring and compilation speed.
+# However, cython does not offer flexibility to make two linkage steps (one "nvcc --dlink" to link the cuda ".o" together,
+# the other to link the pyx ".o" with the linked cuda ".o").
+#
+# If you still want to use separate compilation :
+#   - replace "-c $^" with "-dc $^" in the Makefile targets rules
+#   - uncomment the definition of SEPARATE_COMPILATION in filters.h
 #
 demo: $(PDWTCORE) src/demo.cpp src/io.cpp
 	mkdir -p build
-	$(NVCC) -g $(CFLAGS) -odir build -dc $^
+	$(NVCC) -g $(CFLAGS) -odir build -c $^
 	$(NVCC) $(CFLAGS) -o demo $(PDWTOBJ) build/demo.o build/io.o $(LDFLAGS)
 
 
 libpdwt.so: $(PDWTCORE)
 	mkdir -p build
-	$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -odir build -dc $^
+	$(NVCC) $(CFLAGS) --ptxas-options=-v --compiler-options '-fPIC' -odir build -c $^
 	$(NVCC) $(CFLAGS)  -o $@ --shared $(PDWTOBJ) $(LDFLAGS)
 
 
 # Double precision library
 libpdwtd.so: $(PDWTCORE)
 	mkdir -p build
-	$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -DDOUBLEPRECISION -odir build -dc $^
+	$(NVCC) --ptxas-options=-v --compiler-options '-fPIC' -DDOUBLEPRECISION -odir build -c $^
 	$(NVCC) $(CFLAGS)  -o $@ --shared $(PDWTOBJ) $(LDFLAGS)
 
 

diff --git a/src/common.h b/src/common.h
@@ -14,6 +14,7 @@
 
 #define MAX_FILTER_WIDTH 40
 
+#ifdef SEPARATE_COMPILATION
 extern __constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
 extern __constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
 extern __constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
@@ -23,6 +24,17 @@ extern __constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
 extern __constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
 extern __constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
 extern __constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+#else
+__constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_IH[MAX_FILTER_WIDTH];
+
+__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+#endif
 
 
 __global__ void w_kern_soft_thresh(DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, DTYPE beta, int Nr, int Nc);

diff --git a/src/filters.h b/src/filters.h
@@ -40,6 +40,10 @@
 #define LIBEXPORT
 #endif
 
+// Uncomment to use separate compilation
+// mind that the "-dc" compilation flag must be used instead of "-c"
+// #define SEPARATE_COMPILATION
+
 
 LIBEXPORT DTYPE DB2_L[4];
 

diff --git a/src/nonseparable.cu b/src/nonseparable.cu
@@ -1,6 +1,16 @@
 #include "nonseparable.h"
 #include "common.h"
 
+#ifdef SEPARATE_COMPILATION
+// Required for separate compilation (see Makefile)
+#ifndef CONSTMEM_FILTERS_NS
+#define CONSTMEM_FILTERS_NS
+__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+#endif
+#endif
 
 // outer product of arrays "a", "b" of length "len"
 DTYPE* w_outer(DTYPE* a, DTYPE* b, int len) {
@@ -73,6 +83,29 @@ int w_compute_filters(const char* wname, int direction, int do_swt) {
 }
 
 
+int w_set_filters_forward_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len) {
+    if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+            || cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+            || cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+            || cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    {
+        return -3;
+    }
+    return 0;
+}
+
+int w_set_filters_inverse_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len) {
+    if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+        || cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+        || cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+        || cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    {
+        return -3;
+    }
+    return 0;
+}
+
+
 
 
 

diff --git a/src/nonseparable.h b/src/nonseparable.h
@@ -5,6 +5,10 @@
 
 DTYPE* w_outer(DTYPE* a, DTYPE* b, int len);
 int w_compute_filters(const char* wname, int direction, int do_swt);
+int w_set_filters_forward_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len);
+int w_set_filters_inverse_nonseparable(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4, uint len);
+
+
 
 __global__ void w_kern_forward(DTYPE* img, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int hlen);
 __global__ void w_kern_inverse(DTYPE* img, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int Nr2, int Nc2, int hlen);

diff --git a/src/separable.cu b/src/separable.cu
@@ -1,18 +1,15 @@
 #include "separable.h"
 #include "common.h"
 
+#ifdef SEPARATE_COMPILATION
 // Required for separate compilation (see Makefile)
-#ifndef CONSTMEM_FILTERS
-#define CONSTMEM_FILTERS
+#ifndef CONSTMEM_FILTERS_S
+#define CONSTMEM_FILTERS_S
 __constant__ DTYPE c_kern_L[MAX_FILTER_WIDTH];
 __constant__ DTYPE c_kern_H[MAX_FILTER_WIDTH];
 __constant__ DTYPE c_kern_IL[MAX_FILTER_WIDTH];
 __constant__ DTYPE c_kern_IH[MAX_FILTER_WIDTH];
-
-__constant__ DTYPE c_kern_LL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
-__constant__ DTYPE c_kern_LH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
-__constant__ DTYPE c_kern_HL[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
-__constant__ DTYPE c_kern_HH[MAX_FILTER_WIDTH * MAX_FILTER_WIDTH];
+#endif
 #endif
 
 
@@ -57,6 +54,28 @@ int w_compute_filters_separable(const char* wname, int do_swt) {
 }
 
 
+int w_set_filters_forward(DTYPE* filter1, DTYPE* filter2, uint len) {
+    if (cudaMemcpyToSymbol(c_kern_L, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+        || cudaMemcpyToSymbol(c_kern_H, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    {
+        return -3;
+    }
+    return 0;
+}
+
+int w_set_filters_inverse(DTYPE* filter1, DTYPE* filter2, uint len) {
+    if (cudaMemcpyToSymbol(c_kern_IL, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
+        || cudaMemcpyToSymbol(c_kern_IH, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    {
+        return -3;
+    }
+    return 0;
+}
+
+
+
+
+
 
 
 

diff --git a/src/separable.h b/src/separable.h
@@ -3,6 +3,10 @@
 #include "utils.h"
 
 int w_compute_filters_separable(const char* wname, int do_swt);
+int w_set_filters_forward(DTYPE* filter1, DTYPE* filter2, uint len);
+int w_set_filters_inverse(DTYPE* filter1, DTYPE* filter2, uint len);
+
+
 __global__ void w_kern_forward_pass1(DTYPE* img, DTYPE* tmp_a1, DTYPE* tmp_a2, int Nr, int Nc, int hlen);
 __global__ void w_kern_forward_pass2(DTYPE* tmp_a1, DTYPE* tmp_a2, DTYPE* c_a, DTYPE* c_h, DTYPE* c_v, DTYPE* c_d, int Nr, int Nc, int hlen);
 int w_forward_separable(DTYPE* d_image, DTYPE** d_coeffs, DTYPE* d_tmp, w_info winfos);

diff --git a/src/wt.cu b/src/wt.cu
@@ -543,64 +543,50 @@ void Wavelets::print_informations() {
 /// Provide a custom filter bank to the current Wavelet instance.
 /// If do_separable = 1, the filters are expected to be L, H.
 /// Otherwise, the filters are expected to be A, H, V, D (square size)
+// We cannot directly use the __constant__ symbols (unless with separate compilation),
+// hence a further indirection in (non)separable.cu where these symbols are defined
 int Wavelets::set_filters_forward(char* filtername, uint len, DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4) {
+    int res = 0;
     if (len > MAX_FILTER_WIDTH) {
         printf("ERROR: Wavelets.set_filters_forward(): filter length (%d) exceeds the maximum size (%d)\n", len, MAX_FILTER_WIDTH);
         return -1;
     }
     if (do_separable) {
-        if (cudaMemcpyToSymbol(c_kern_L, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_H, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
-        {
-            return -3;
-        }
+        res = w_set_filters_forward(filter1, filter2, len);
     }
     else {
         if (filter3 == NULL || filter4 == NULL) {
             puts("ERROR: Wavelets.set_filters_forward(): expected argument 4 and 5 for non-separable filtering");
             return -2;
         }
-        if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
-        {
-            return -3;
-        }
+        res = w_set_filters_forward_nonseparable(filter1, filter2, filter3, filter4, len);
     }
     winfos.hlen = len;
     strncpy(wname, filtername, 128);
 
-    return 0;
+    return res;
 }
 
 /// Here the filters are assumed to be of the same size of those provided to set_filters_forward()
+// We cannot directly use the __constant__ symbols (unless with separate compilation),
+// hence a further indirection in (non)separable.cu where these symbols are defined
 int Wavelets::set_filters_inverse(DTYPE* filter1, DTYPE* filter2, DTYPE* filter3, DTYPE* filter4) {
     uint len = winfos.hlen;
+    int res = 0;
     if (do_separable) {
         // ignoring args 4 and 5
-        if (cudaMemcpyToSymbol(c_kern_IL, filter1, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_IH, filter2, len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
-        {
-            return -3;
-        }
+        res = w_set_filters_inverse(filter1, filter2, len);
     }
     else {
         if (filter3 == NULL || filter4 == NULL) {
             puts("ERROR: Wavelets.set_filters_inverse(): expected argument 4 and 5 for non-separable filtering");
             return -2;
         }
         // The same symbols are used for the inverse filters
-        if (cudaMemcpyToSymbol(c_kern_LL, filter1, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_LH, filter2, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_HL, filter3, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess
-            || cudaMemcpyToSymbol(c_kern_HH, filter4, len*len*sizeof(DTYPE), 0, cudaMemcpyHostToDevice) != cudaSuccess)
-        {
-            return -3;
-        }
+        res = w_set_filters_inverse_nonseparable(filter1, filter2, filter3, filter4, len);
     }
 
-    return 0;
+    return res;
 }