Skip to content

Commit

Permalink
Initial commit with code
Browse files Browse the repository at this point in the history
  • Loading branch information
Erik Traff committed Oct 20, 2022
1 parent f640716 commit e633d4f
Show file tree
Hide file tree
Showing 27 changed files with 4,049 additions and 1 deletion.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
*.o
*.json
*.vtu
*.txt

benchmark
test_stencil_methods
top3d
31 changes: 31 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
CFLAGS = -Wall -g -mp=gpu -gpu=cc80 -mp=noautopar -Msafeptr -march=native -O4 -mavx -mavx2 #-Minfo
DEFS = -DDTU_HPC
LIBS = -lcholmod -lm -lsuitesparseconfig -lcxsparse
INCL = ""

ifdef OWN_CHOLMOD
LIBS += -L$(HOME)/SuiteSparse/SuiteSparse-5.10.1/lib -L/usr/lib64 -lopenblaso-r0.3.3
INCL += -I$(HOME)/SuiteSparse/SuiteSparse-5.10.1/include/
$(info Using custom suite-sparse)
else
LIBS += -L"/appl/SuiteSparse/5.1.2-sl73/lib/" -L"/appl/OpenBLAS/0.2.20/XeonGold6226R/gcc-6.4.0/lib" -lopenblas
INCL += -I"/appl/SuiteSparse/5.1.2-sl73/include/"
$(info Using old gbar suite-sparse)
endif

$(info LIBS are $(LIBS))
CC = nvc
CXX = g++

OBJ = stencil_methods.o stencil_assembly.o stencil_solvers.o stencil_grid_utility.o stencil_optimization.o local_matrix.o gpu_definitions.o

all: top3d

top3d: top3d.c $(OBJ)
$(CC) -std=c11 $(CFLAGS) -o $@ $^ $(INCL) $(DEFS) $(LIBS)

%.o: %.c definitions.h
$(CC) -std=c11 $(CFLAGS) -o $@ -c $< $(INCL) $(DEFS)

clean:
-rm -f top3dmgcg_matrixfree top3dmgcg_eightColored benchmark test_stencil_methods core top3d *.core *.o
62 changes: 61 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,61 @@
# MatrixFreeOpenmpGPU
# GPU Accelerated Topology Optimisation Based on OpenMP Target Offloading
This repository contains an implementation of a topology optimisation solver for linear elastic compliance minimisation in three dimensions. The implementation is based on OpenMP target offloading to one GPU.

## Linear MBB Beam Example
The result after 20 design iterations can be seen in the folloowing figure.
<p align="center">
<img src="../figures/result.png" width="600," title="Result from 25 iterations">
</p>

## Compilation
Even though OpenMP offloading to GPUs is supported by a wide range of compilers such as LLVM/Clang, ROCm, ICPC, NVC, and GCC, the compilers do not support the same subset of the OpenMP specification. We used NVIDIA's NVC compiler from the NVIDIA HPC SDK. The OpenMP directives may need to be adjusted if you choose to use another compiler.
<br>

### Dependencies
We used the following versions of the `cholmod` library and the `nvc` compiler.

| **Pachage** | **Version** | **Installation** |
| :--- | :--- | :--- |
| `SuiteSparse/CHOLMOD` | 5.1.2 | [See Github release notes](https://github.com/DrTimothyAldenDavis/SuiteSparse/releases/tag/v5.1.2) |
| `nvhpc`| 22.5 | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-releases)|
| `CUDA` | 11.1 | [CUDA Toolkit 11.1.0](https://developer.nvidia.com/cuda-11.1.0-download-archive?target_os=Linux) |

To load the dependencies, you may adjust the module names in `setup_gbar_env.sh`. To load the modules and update `LD_LIBRARY_PATH`, run the following command.
```bash
$ source setup_gbar_env.sh
```

### Compiling the Project
After loading the modules and updating the dynamic link loader path, you may simply compile the project with
```bash
$ make
```
Note that the project has been tuned for an NVIDIA Tesla A100 GPU which has compute capability `-gpu=cc80`. For an NVIDIA Tesla V100, the compute capability must be adjusted to `-gpu=cc70`.
## Running the Code

To run 20 iterations of the code on a grid of 128 times 64 times 64 voxels and save the result you may use the following commands:
```bash
$ export OMP_NUM_THREADS=12
$ export OMP_TARGET_OFFLOAD=MANDATORY
$ export CUDA_VISIBLE_DEVICES=0
$ ./top3d -x 16 -y 8 -z 8 -l 4 -w 1
```
In the above example, we specify that we wish to use four levels in the multi-grid preconditioner with `-l 4`. Therefore we must divide the domain size by 2^l-1 to find the size of the coarsest grid which the application takes as input.

### Inputs and Options
The following table includes a list of all the available options.

| **Option** | **Description** | **Default Value** |
| :--- | :--- | ---: |
| -x | The number of elements in the x-dimension on the coarsest grid. | 12 |
| -y | The number of elements in the y-dimension on the coarsest grid. | 6 |
| -z | The number of elements in the z-dimension on the coarsest grid. | 6 |
| -f | Volume fraction | 0.20 |
| -r | Filter radius in elements | 1.5 |
| -i | Number of design iterations | 20 |
| -v | Verbose mode. Select 0 to disable. Set to 1 to enable. Set to 2 to get detailed information. | 0 |
| -l | Number of layers in multigrid. | 4 |
| -w | Write result to `.vtu` file by setting `-w 1`. | 0 |

## Authorship
This code has been developed by Erik Albert Träff in collaboration with Anton Rydahl under the supervision of Niels Aage, Ole Signmund, and Sven Karlsson.
52 changes: 52 additions & 0 deletions definitions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include <math.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define __force_inline __attribute__((always_inline))
#define __force_unroll __attribute__((optimize("unroll-loops")))
#define __alignBound 32

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) > (b) ? (b) : (a))

// define stencil sizes at compile time
#define STENCIL_SIZE_Y 8
// The code will not work if STENCIL_SIZE_X and STENCIL_SIZE_Z are not equal to one!
#define STENCIL_SIZE_X 1
#define STENCIL_SIZE_Z 1

#define number_of_matrix_free_levels 2

typedef double MTYPE;
typedef double STYPE;
typedef float CTYPE;

typedef float DTYPE; // design type, for element denseties, gradients and such.

struct FixedDofs {
uint_fast32_t n;
uint_fast32_t *idx;
};

struct gridContext {
double E0;
double Emin;
double nu;
double elementSizeX;
double elementSizeY;
double elementSizeZ;
uint_fast32_t nelx;
uint_fast32_t nely;
uint_fast32_t nelz;
uint_fast32_t wrapx;
uint_fast32_t wrapy;
uint_fast32_t wrapz;
double penal;
MTYPE **precomputedKE;

struct FixedDofs *fixedDofs;
};
Binary file added figures/cg_running_time.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/result.pdf
Binary file not shown.
Binary file added figures/result.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/running_time.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/speedup.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
195 changes: 195 additions & 0 deletions gpu_definitions.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#include "gpu_definitions.h"

void gridContextToGPUGrid(
struct gridContext * gc,
struct gpuGrid * gpu_gc,
const int nl,
const int verbose)
{
const int num_targets = gpu_gc->num_targets;
gpu_gc->target = (struct gpuNode *) malloc(num_targets*sizeof(struct gpuNode));

// Finding size of x per target
const int nelx = gc->nelx;
const int block_x = (int) nelx/num_targets;

for (int i = 0;i<num_targets;i++){
// Initializing node
(gpu_gc->target[i]).gc = (struct gridContext *) malloc(sizeof(struct gridContext));
struct gpuNode * node = &(gpu_gc->target[i]);
node -> id = i;
if (i>0){
node -> prev = &(gpu_gc->target[i-1]);
node -> offset_x = 1;
}
else {
node -> prev = NULL;
node -> offset_x = 0;
}
if (i<num_targets-1){
node -> next = &(gpu_gc->target[i+1]);
}
else {
node -> next = NULL;
}

// Global offset in the grid
node -> x_global = i *block_x;

// Calculating x size
int nelx_local = ( i== num_targets-1) ? nelx - i*num_targets : block_x;
if (node -> prev != NULL){
nelx_local += 1;
}
if (node -> next != NULL){
nelx_local += 1;
}
setupGC(node->gc,nl,nelx_local,gc->nely,gc->nelz);
if (verbose == 1) {
printf("GPU %d:\n\tGlobal index: %d\n\tOffset: %d\n",node->id,node->x_global,node->offset_x);
printf("\tPrev is null? %d\n\tNext is null? %d\n",node->prev == NULL,node->next==NULL);
}
}
}

void freeGPUGrid(
struct gpuGrid * gpu_gc,
const int nl)
{
const int num_targets = gpu_gc->num_targets;
for (int i=0;i<num_targets;i++){
struct gpuNode * node = &(gpu_gc->target[i]);
freeGC(node->gc,nl);
}
free(gpu_gc->target);
}

void computePadding(
const struct gridContext * gc,
const int l,
int * ncell,
int * wrapx,
int * wrapy,
int * wrapz,
uint_fast32_t * ndof
){
*ncell = pow(2, l);
const int32_t nelx = gc->nelx / (*ncell);
const int32_t nely = gc->nely / (*ncell);
const int32_t nelz = gc->nelz / (*ncell);
const int paddingx =
(STENCIL_SIZE_X - ((nelx + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X;
const int paddingy =
(STENCIL_SIZE_Y - ((nely + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y;
const int paddingz =
(STENCIL_SIZE_Z - ((nelz + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z;

// Writing the result
*wrapx = nelx + paddingx + 3;
*wrapy = nely + paddingy + 3;
*wrapz = nelz + paddingz + 3;
*ndof = 3 * (*wrapy) * (*wrapz) * (*wrapx);
}

DTYPE compute_volume(
const struct gridContext gc,
DTYPE * xPhys
){
DTYPE vol = 0.0;
#pragma omp target teams distribute parallel for collapse(3) map(always,tofrom:vol) reduction(+ : vol)
for (int i = 1; i < gc.nelx + 1; i++)
for (int k = 1; k < gc.nelz + 1; k++)
for (int j = 1; j < gc.nely + 1; j++) {
const int idx =
i * (gc.wrapy - 1) * (gc.wrapz - 1) +
k * (gc.wrapy - 1) + j;

vol += xPhys[idx];
}
return vol;
}

float compute_change(
DTYPE * x,
DTYPE * xnew,
const int nelem
){
float change = 0.0;

#pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:change) reduction(max : change)
for (uint_least32_t i = 0; i < nelem; i++) {
change = MAX(change, fabs(x[i] - xnew[i]));
x[i] = xnew[i];
}
return change;
}

void update_solution(
const DTYPE * x,
DTYPE * xnew,
const DTYPE * dv,
const DTYPE * dc,
const int nelem,
const DTYPE g
){
DTYPE l1 = 0.0, l2 = 1e9, move = 0.2;

while ((l2 - l1) / (l1 + l2) > 1e-6) {
DTYPE lmid = 0.5 * (l2 + l1);
DTYPE gt = 0.0;


#pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:gt) firstprivate(move,lmid) reduction(+:gt)
for (uint_least32_t i = 0; i < nelem; i++) {
xnew[i] =
MAX(0.0, MAX(x[i] - move,
MIN(1.0, MIN(x[i] + move,
x[i] * sqrt(-dc[i] / (dv[i] * lmid))))));
gt += dv[i] * (xnew[i] - x[i]);
}

gt += g;
if (gt > 0)
l1 = lmid;
else
l2 = lmid;
}
}

void enter_data(
const struct gridContext gridContext,
const struct SolverDataBuffer solverData,
const struct gpuGrid gpu_gc,
const DTYPE * xPhys,
const DTYPE * x,
const DTYPE * xnew,
const STYPE * U,
const CTYPE * F,
const DTYPE * dc,
const int nelem,
const int nl
){
int u_size = 0;
const int ndof = 3 * gridContext.wrapx * gridContext.wrapy * gridContext.wrapz;
for (int l_tmp = 0;l_tmp < nl; l_tmp++){
int ncell;
int wrapxc;
int wrapyc;
int wrapzc;
uint_fast32_t ndofc;
computePadding(&gridContext,l_tmp,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc);
if (l_tmp == 0){
u_size = ndofc;
}
CTYPE *zptr = solverData.zmg[l_tmp];
CTYPE *dptr = solverData.dmg[l_tmp];
CTYPE *rptr = solverData.rmg[l_tmp];
MTYPE *invDptr = solverData.invD[l_tmp];
const MTYPE * KE = gpu_gc.target[0].gc->precomputedKE[l_tmp];
#pragma omp target enter data map(to:zptr[:ndofc],invDptr[:ndofc],rptr[:ndofc],\
dptr[:ndofc],KE[:24*24*ncell*ncell*ncell]) device(gpu_gc.target[0].id) nowait
}
#pragma omp target enter data map(to:xPhys[:nelem],x[:nelem],dc[:nelem],xnew[:nelem],U[:u_size],F[:ndof],\
solverData.r[:ndof],solverData.p[:ndof],solverData.q[:ndof],solverData.z[:ndof]) device(gpu_gc.target[0].id) nowait
#pragma omp taskwait
}
Loading

0 comments on commit e633d4f

Please sign in to comment.