Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 98c236a

Browse files
authored
Initial support for CUDA unified memory (#163)
- most of the data structures are allocated with cudaMallocManaged - few data structures still need to be copied explicitly because of how we use non POD types between CPU and GPU - Performance of this is not as good as explicit copy using OpenACC but this is because of frequent data access (page faults) for container structures like NrnThread, MembList etc. - Added -DENABLE_UNIFIED=1 variable to CMake - REDME is not updated because this won't be exposed to normal build (yet)
1 parent 31ac2eb commit 98c236a

20 files changed

+425
-312
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ option(ENABLE_ISPC_TARGET "Enable ispc interoperability structs and data" OFF)
6363
option(ENABLE_NMODL "Enable external nmodl source-to-source compiler" OFF)
6464
option(ENABLE_CALIPER "Enable Caliper instrumentation" OFF)
6565
option(CORENEURON_ENABLE_LIKWID "Enable LIKWID instrumentation" OFF)
66+
option(ENABLE_UNIFIED "Enable Unified Memory implementation of GPU" OFF)
6667

6768
## set C++11 standard to be default
6869
set(CMAKE_CXX_STANDARD 11)
@@ -264,6 +265,9 @@ endif()
264265

265266
if(ENABLE_OPENACC)
266267
set(COMPILE_LIBRARY_TYPE "STATIC")
268+
if (ENABLE_UNIFIED)
269+
add_definitions( -DUNIFIED_MEMORY)
270+
endif()
267271
if(${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
268272
add_definitions( -DPG_ACC_BUGS)
269273
set(ACC_FLAGS "-acc -Minline=size:200,levels:10")

coreneuron/nrnconf.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ extern void* emalloc(size_t size);
7878
extern void* ecalloc(size_t n, size_t size);
7979
extern void* erealloc(void* ptr, size_t size);
8080
extern void* emalloc_align(size_t size, size_t alignment);
81-
extern void* ecalloc_align(size_t n, size_t alignment, size_t size);
81+
extern void* ecalloc_align(size_t n, size_t size, size_t alignment);
8282
extern double hoc_Exp(double x);
8383
extern void check_bbcore_write_version(const char*);
8484

coreneuron/nrniv/cellorder.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "coreneuron/nrniv/cellorder.h"
66
#include "coreneuron/nrniv/tnode.h"
77
#include "coreneuron/nrniv/lpt.h"
8+
#include "coreneuron/nrniv/memory.h"
89

910
#include "coreneuron/nrniv/node_permute.h" // for print_quality
1011
#include <set>
@@ -80,13 +81,13 @@ InterleaveInfo& InterleaveInfo::operator=(const InterleaveInfo& info) {
8081

8182
InterleaveInfo::~InterleaveInfo() {
8283
if (stride) {
83-
delete[] stride;
84-
delete[] firstnode;
85-
delete[] lastnode;
86-
delete[] cellsize;
84+
free_memory(stride);
85+
free_memory(firstnode);
86+
free_memory(lastnode);
87+
free_memory(cellsize);
8788
}
8889
if (stridedispl) {
89-
delete[] stridedispl;
90+
free_memory(stridedispl);
9091
}
9192
if (idle) {
9293
delete[] nnode;

coreneuron/nrniv/cellorder1.cpp

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
// just for use_interleave_permute
77
#include "coreneuron/nrniv/nrniv_decl.h"
8+
#include "coreneuron/nrniv/memory.h"
89

910
#include <map>
1011
#include <set>
@@ -531,9 +532,9 @@ static void admin1(int ncell,
531532
// cellsize is the number of nodes in the cell not counting root.
532533
// nstride is the maximum cell size (not counting root)
533534
// stride[i] is the number of cells with an ith node.
534-
firstnode = new int[ncell];
535-
lastnode = new int[ncell];
536-
cellsize = new int[ncell];
535+
firstnode = (int*)ecalloc_align(ncell, sizeof(int));
536+
lastnode = (int*)ecalloc_align(ncell, sizeof(int));
537+
cellsize = (int*)ecalloc_align(ncell, sizeof(int));
537538

538539
nwarp = (ncell % warpsize == 0) ? (ncell / warpsize) : (ncell / warpsize + 1);
539540

@@ -557,7 +558,7 @@ static void admin1(int ncell,
557558
}
558559
}
559560

560-
stride = new int[nstride + 1]; // in case back substitution accesses this
561+
stride = (int*)ecalloc_align(nstride + 1, sizeof(int));
561562
for (int i = 0; i <= nstride; ++i) {
562563
stride[i] = 0;
563564
}
@@ -617,10 +618,11 @@ static void admin2(int ncell,
617618
// ncore is the number of warps * warpsize
618619
nwarp = nodevec[ncell - 1]->groupindex + 1;
619620

620-
ncycles = new int[nwarp];
621-
stridedispl = new int[nwarp + 1]; // running sum of ncycles (start at 0)
622-
rootbegin = new int[nwarp + 1]; // index (+1) of first root in warp.
623-
nodebegin = new int[nwarp + 1]; // index (+1) of first node in warp.
621+
ncycles = (int*)ecalloc_align(nwarp, sizeof(int));
622+
stridedispl =
623+
(int*)ecalloc_align(nwarp + 1, sizeof(int)); // running sum of ncycles (start at 0)
624+
rootbegin = (int*)ecalloc_align(nwarp + 1, sizeof(int)); // index (+1) of first root in warp.
625+
nodebegin = (int*)ecalloc_align(nwarp + 1, sizeof(int)); // index (+1) of first node in warp.
624626

625627
// rootbegin and nodebegin are the root index values + 1 of the last of
626628
// the sequence of constant groupindex
@@ -650,7 +652,7 @@ static void admin2(int ncell,
650652
}
651653

652654
// strides
653-
strides = new int[nstride];
655+
strides = (int*)ecalloc_align(nstride, sizeof(int));
654656
nstride = 0;
655657
for (size_t iwarp = 0; iwarp < (size_t)nwarp; ++iwarp) {
656658
size_t j = size_t(nodebegin[iwarp + 1]);

coreneuron/nrniv/main1.cpp

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -364,52 +364,53 @@ const char* nrn_version(int) {
364364
// bsize = 0 then per step transfer
365365
// bsize > 1 then full trajectory save into arrays.
366366
void get_nrn_trajectory_requests(int bsize) {
367-
if (nrn2core_get_trajectory_requests_) {
368-
for (int tid=0; tid < nrn_nthread; ++tid) {
369-
NrnThread& nt = nrn_threads[tid];
370-
int n_pr;
371-
int n_trajec;
372-
int* types;
373-
int* indices;
374-
void** vpr;
375-
double** varrays;
376-
double** pvars;
377-
378-
// bsize is passed by reference, the return value will determine if
379-
// per step return or entire trajectory return.
380-
(*nrn2core_get_trajectory_requests_)(tid, bsize, n_pr, vpr, n_trajec, types, indices, pvars, varrays);
381-
delete_trajectory_requests(nt);
382-
if (n_trajec) {
383-
TrajectoryRequests* tr = new TrajectoryRequests;
384-
nt.trajec_requests = tr;
385-
tr->bsize = bsize;
386-
tr->n_pr = n_pr;
387-
tr->n_trajec = n_trajec;
388-
tr->vsize = 0;
389-
tr->vpr = vpr;
390-
tr->gather = new double*[n_trajec];
391-
tr->varrays = varrays;
392-
tr->scatter = pvars;
393-
for (int i=0; i < n_trajec; ++i) {
394-
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
367+
if (nrn2core_get_trajectory_requests_) {
368+
for (int tid = 0; tid < nrn_nthread; ++tid) {
369+
NrnThread& nt = nrn_threads[tid];
370+
int n_pr;
371+
int n_trajec;
372+
int* types;
373+
int* indices;
374+
void** vpr;
375+
double** varrays;
376+
double** pvars;
377+
378+
// bsize is passed by reference, the return value will determine if
379+
// per step return or entire trajectory return.
380+
(*nrn2core_get_trajectory_requests_)(tid, bsize, n_pr, vpr, n_trajec, types, indices,
381+
pvars, varrays);
382+
delete_trajectory_requests(nt);
383+
if (n_trajec) {
384+
TrajectoryRequests* tr = new TrajectoryRequests;
385+
nt.trajec_requests = tr;
386+
tr->bsize = bsize;
387+
tr->n_pr = n_pr;
388+
tr->n_trajec = n_trajec;
389+
tr->vsize = 0;
390+
tr->vpr = vpr;
391+
tr->gather = new double*[n_trajec];
392+
tr->varrays = varrays;
393+
tr->scatter = pvars;
394+
for (int i = 0; i < n_trajec; ++i) {
395+
tr->gather[i] = stdindex2ptr(types[i], indices[i], nt);
396+
}
397+
delete[] types;
398+
delete[] indices;
399+
}
395400
}
396-
delete [] types;
397-
delete [] indices;
398-
}
399401
}
400-
}
401402
}
402403

403404
static void trajectory_return() {
404-
if (nrn2core_trajectory_return_) {
405-
for (int tid=0; tid < nrn_nthread; ++tid) {
406-
NrnThread& nt = nrn_threads[tid];
407-
TrajectoryRequests* tr = nt.trajec_requests;
408-
if (tr && tr->varrays) {
409-
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->vsize, tr->vpr, nt._t);
410-
}
405+
if (nrn2core_trajectory_return_) {
406+
for (int tid = 0; tid < nrn_nthread; ++tid) {
407+
NrnThread& nt = nrn_threads[tid];
408+
TrajectoryRequests* tr = nt.trajec_requests;
409+
if (tr && tr->varrays) {
410+
(*nrn2core_trajectory_return_)(tid, tr->n_pr, tr->vsize, tr->vpr, nt._t);
411+
}
412+
}
411413
}
412-
}
413414
}
414415

415416
} // namespace coreneuron
@@ -493,7 +494,7 @@ extern "C" int run_solve_core(int argc, char** argv) {
493494
if (corenrn_embedded) {
494495
// arg is vector size required but NEURON can instead
495496
// specify that returns will be on a per time step basis.
496-
get_nrn_trajectory_requests(int(tstop/dt) + 2);
497+
get_nrn_trajectory_requests(int(tstop / dt) + 2);
497498
(*nrn2core_part2_clean_)();
498499
}
499500

coreneuron/nrniv/memory.h

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,57 @@ THE POSSIBILITY OF SUCH DAMAGE.
2929
#ifndef _H_MEMORY_
3030
#define _H_MEMORY_
3131

32-
#include <stdlib.h>
3332
#include <string.h>
33+
3434
#include "coreneuron/nrniv/nrn_assert.h"
3535

36+
#if !defined(NRN_SOA_BYTE_ALIGN)
37+
// for layout 0, every range variable array must be aligned by at least 16 bytes (the size of the
38+
// simd memory bus)
39+
#define NRN_SOA_BYTE_ALIGN (8 * sizeof(double))
40+
#endif
41+
42+
/// for gpu builds with unified memory support
43+
#if (defined(__CUDACC__) || defined(UNIFIED_MEMORY))
44+
45+
#include <cuda_runtime_api.h>
46+
47+
// TODO : error handling for CUDA routines
48+
inline void alloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/) {
49+
cudaMallocManaged(&pointer, num_bytes);
50+
}
51+
52+
inline void calloc_memory(void*& pointer, size_t num_bytes, size_t /*alignment*/) {
53+
alloc_memory(pointer, num_bytes, 64);
54+
cudaMemset(pointer, 0, num_bytes);
55+
}
56+
57+
inline void free_memory(void* pointer) {
58+
cudaFree(pointer);
59+
}
60+
61+
/// for cpu builds use posix memalign
62+
#else
63+
64+
#include <stdlib.h>
65+
66+
inline void alloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
67+
nrn_assert(posix_memalign(&pointer, alignment, num_bytes) == 0);
68+
}
69+
70+
inline void calloc_memory(void*& pointer, size_t num_bytes, size_t alignment) {
71+
alloc_memory(pointer, num_bytes, alignment);
72+
memset(pointer, 0, num_bytes);
73+
}
74+
75+
inline void free_memory(void* pointer) {
76+
free(pointer);
77+
}
78+
79+
#endif
80+
3681
namespace coreneuron {
82+
3783
/** Independent function to compute the needed chunkding,
3884
the chunk argument is the number of doubles the chunk is chunkded upon.
3985
*/
@@ -57,23 +103,22 @@ inline bool is_aligned(void* pointer, size_t alignment) {
57103

58104
/** Allocate the aligned memory.
59105
*/
60-
inline void* emalloc_align(size_t size, size_t alignment) {
106+
inline void* emalloc_align(size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
61107
void* memptr;
62-
nrn_assert(posix_memalign(&memptr, alignment, size) == 0);
108+
alloc_memory(memptr, size, alignment);
63109
nrn_assert(is_aligned(memptr, alignment));
64110
return memptr;
65111
}
66112

67-
/** Allocate the aligned memory and set it to 1.
113+
/** Allocate the aligned memory and set it to 0.
68114
*/
69-
inline void* ecalloc_align(size_t n, size_t alignment, size_t size) {
115+
inline void* ecalloc_align(size_t n, size_t size, size_t alignment = NRN_SOA_BYTE_ALIGN) {
70116
void* p;
71117
if (n == 0) {
72118
return (void*)0;
73119
}
74-
nrn_assert(posix_memalign(&p, alignment, n * size) == 0);
120+
calloc_memory(p, n * size, alignment);
75121
nrn_assert(is_aligned(p, alignment));
76-
memset(p, 1, n * size); // Avoid native division by zero (cyme...)
77122
return p;
78123
}
79124
} // namespace coreneuron

coreneuron/nrniv/mk_mech.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ void mk_mech(const char* datpath) {
119119
// we are embedded in NEURON, get info as stringstream from nrnbbcore_write.cpp
120120
static void mk_mech() {
121121
static bool done = false;
122-
if (done) { return; }
122+
if (done) {
123+
return;
124+
}
123125
nrn_need_byteswap = 0;
124126
std::stringstream ss;
125127
nrn_assert(nrn2core_mkmech_info_);

coreneuron/nrniv/nrn2core_direct.h

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -92,28 +92,20 @@ extern void (*nrn2core_part2_clean_)();
9292

9393
/* what variables to send back to NEURON on each time step */
9494
extern void (*nrn2core_get_trajectory_requests_)(int tid,
95-
int& bsize,
96-
int& n_pr,
97-
void**& vpr,
98-
int& n_trajec,
99-
int*& types,
100-
int*& indices,
101-
double**& pvars,
102-
double**& varrays);
95+
int& bsize,
96+
int& n_pr,
97+
void**& vpr,
98+
int& n_trajec,
99+
int*& types,
100+
int*& indices,
101+
double**& pvars,
102+
double**& varrays);
103103

104104
/* send values to NEURON on each time step */
105-
extern void (*nrn2core_trajectory_values_)(int tid,
106-
int n_pr,
107-
void** vpr,
108-
double t);
105+
extern void (*nrn2core_trajectory_values_)(int tid, int n_pr, void** vpr, double t);
109106

110107
/* Filled the Vector data arrays and send back the sizes at end of run */
111-
extern void (*nrn2core_trajectory_return_)(int tid,
112-
int n_pr,
113-
int vecsz,
114-
void** vpr,
115-
double t);
116-
108+
extern void (*nrn2core_trajectory_return_)(int tid, int n_pr, int vecsz, void** vpr, double t);
117109
}
118110

119111
#endif /* nrn2core_direct_h */

0 commit comments

Comments
 (0)