Initial commit with code

topopt · Oct 20, 2022 · e633d4f · e633d4f
1 parent f640716
commit e633d4f
Show file tree

Hide file tree

Showing 27 changed files with 4,049 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+*.o
+*.json
+*.vtu
+*.txt
+
+benchmark
+test_stencil_methods
+top3d
diff --git a/Makefile b/Makefile
@@ -0,0 +1,31 @@
+CFLAGS = -Wall -g -mp=gpu -gpu=cc80 -mp=noautopar -Msafeptr -march=native -O4 -mavx -mavx2 #-Minfo
+DEFS = -DDTU_HPC
+LIBS = -lcholmod -lm -lsuitesparseconfig -lcxsparse
+INCL = ""
+
+ifdef OWN_CHOLMOD
+LIBS += -L$(HOME)/SuiteSparse/SuiteSparse-5.10.1/lib -L/usr/lib64 -lopenblaso-r0.3.3
+INCL += -I$(HOME)/SuiteSparse/SuiteSparse-5.10.1/include/
+$(info Using custom suite-sparse)
+else 
+LIBS += -L"/appl/SuiteSparse/5.1.2-sl73/lib/" -L"/appl/OpenBLAS/0.2.20/XeonGold6226R/gcc-6.4.0/lib" -lopenblas
+INCL += -I"/appl/SuiteSparse/5.1.2-sl73/include/"
+$(info Using old gbar suite-sparse)
+endif
+
+$(info LIBS are $(LIBS))
+CC = nvc
+CXX = g++
+
+OBJ = stencil_methods.o stencil_assembly.o stencil_solvers.o stencil_grid_utility.o stencil_optimization.o local_matrix.o gpu_definitions.o
+
+all: top3d
+
+top3d: top3d.c $(OBJ)
+	$(CC) -std=c11 $(CFLAGS) -o $@ $^ $(INCL) $(DEFS) $(LIBS)
+
+%.o: %.c definitions.h
+	$(CC) -std=c11 $(CFLAGS) -o $@ -c $< $(INCL) $(DEFS)
+
+clean:
+	-rm -f top3dmgcg_matrixfree top3dmgcg_eightColored benchmark test_stencil_methods core top3d *.core *.o 
diff --git a/README.md b/README.md
@@ -1 +1,61 @@
-# MatrixFreeOpenmpGPU
+# GPU Accelerated Topology Optimisation Based on OpenMP Target Offloading
+This repository contains an implementation of a topology  optimisation solver for linear  elastic compliance minimisation in three dimensions. The implementation is based on OpenMP target offloading to one GPU.
+
+## Linear MBB Beam Example
+The result after 20 design iterations can be seen in the folloowing figure.
+<p align="center">
+  <img src="../figures/result.png" width="600," title="Result from 25 iterations">
+</p>
+
+## Compilation
+Even though OpenMP offloading to GPUs is supported by a wide range of compilers such as LLVM/Clang, ROCm, ICPC, NVC, and GCC, the compilers do not support the same subset of the OpenMP specification. We used NVIDIA's NVC compiler from the NVIDIA HPC SDK. The OpenMP directives may need to be adjusted if you choose to use another compiler.
+<br>
+
+### Dependencies
+We used the following versions of the `cholmod` library and the `nvc` compiler.
+
+| **Pachage**           | **Version** | **Installation** |
+| :---                  | :---        | :---           |
+| `SuiteSparse/CHOLMOD` | 5.1.2       | [See Github release notes](https://github.com/DrTimothyAldenDavis/SuiteSparse/releases/tag/v5.1.2) |
+| `nvhpc`| 22.5 | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-releases)|
+| `CUDA` | 11.1 | [CUDA Toolkit 11.1.0](https://developer.nvidia.com/cuda-11.1.0-download-archive?target_os=Linux) |
+
+To load the dependencies, you may adjust the module names in `setup_gbar_env.sh`. To load the modules and update `LD_LIBRARY_PATH`, run the following command.
+```bash
+$ source setup_gbar_env.sh
+```
+
+### Compiling the Project
+After loading the modules and updating the dynamic link loader path, you may simply compile the project with
+```bash
+$ make
+```
+Note that the project has been tuned for an NVIDIA Tesla A100 GPU which has compute capability `-gpu=cc80`. For an NVIDIA Tesla V100, the compute capability must be adjusted to `-gpu=cc70`. 
+## Running the Code
+
+To run 20 iterations of the code on a grid of 128 times 64 times 64 voxels and save the result you may use the following commands:
+```bash
+$ export OMP_NUM_THREADS=12
+$ export OMP_TARGET_OFFLOAD=MANDATORY
+$ export CUDA_VISIBLE_DEVICES=0
+$ ./top3d -x 16 -y 8 -z 8 -l 4 -w 1
+```
+In the above example, we specify that we wish to use four levels in the multi-grid preconditioner with `-l 4`. Therefore we must divide the domain size by 2^l-1 to find the size of the coarsest grid which the application takes as input. 
+
+### Inputs and Options
+The following table includes a list of all the available options.
+
+| **Option** |                                          **Description**                                     | **Default Value** |
+| :---       | :---                                                                                         | ---:              |
+| -x         | The number of elements in the x-dimension on the coarsest grid.                              | 12                |
+| -y         | The number of elements in the y-dimension on the coarsest grid.                              | 6                 |
+| -z         | The number of elements in the z-dimension on the coarsest grid.                              | 6                 |
+| -f         | Volume fraction                                                                              | 0.20              |
+| -r         | Filter radius in elements                                                                    | 1.5               |
+| -i         | Number of design iterations                                                                  | 20                |
+| -v         | Verbose mode. Select 0 to disable. Set to 1 to enable. Set to 2 to get detailed information. | 0                 |
+| -l         | Number of layers in multigrid.                                                               | 4                 |
+| -w         | Write result to `.vtu` file by setting `-w 1`.                                               | 0                 |
+
+## Authorship
+This code has been developed by Erik Albert Träff in collaboration with Anton Rydahl under the supervision of Niels Aage, Ole Signmund, and Sven Karlsson.
diff --git a/definitions.h b/definitions.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <math.h>
+#include <stdalign.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define __force_inline __attribute__((always_inline))
+#define __force_unroll __attribute__((optimize("unroll-loops")))
+#define __alignBound 32
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+
+// define stencil sizes at compile time
+#define STENCIL_SIZE_Y 8
+// The code will not work if STENCIL_SIZE_X and STENCIL_SIZE_Z are not equal to one!
+#define STENCIL_SIZE_X 1
+#define STENCIL_SIZE_Z 1
+
+#define number_of_matrix_free_levels 2
+
+typedef double MTYPE;
+typedef double STYPE;
+typedef float CTYPE;
+
+typedef float DTYPE; // design type, for element denseties, gradients and such.
+
+struct FixedDofs {
+  uint_fast32_t n;
+  uint_fast32_t *idx;
+};
+
+struct gridContext {
+  double E0;
+  double Emin;
+  double nu;
+  double elementSizeX;
+  double elementSizeY;
+  double elementSizeZ;
+  uint_fast32_t nelx;
+  uint_fast32_t nely;
+  uint_fast32_t nelz;
+  uint_fast32_t wrapx;
+  uint_fast32_t wrapy;
+  uint_fast32_t wrapz;
+  double penal;
+  MTYPE **precomputedKE;
+
+  struct FixedDofs *fixedDofs;
+};
diff --git a/figures/cg_running_time.png b/figures/cg_running_time.png
diff --git a/figures/result.pdf b/figures/result.pdf
diff --git a/figures/result.png b/figures/result.png
diff --git a/figures/running_time.png b/figures/running_time.png
diff --git a/figures/speedup.png b/figures/speedup.png
diff --git a/gpu_definitions.c b/gpu_definitions.c
@@ -0,0 +1,195 @@
+#include "gpu_definitions.h"
+
+void gridContextToGPUGrid(
+    struct gridContext * gc,
+    struct gpuGrid * gpu_gc,
+    const int nl,
+    const int verbose)
+{
+    const int num_targets = gpu_gc->num_targets;
+    gpu_gc->target = (struct gpuNode *) malloc(num_targets*sizeof(struct gpuNode));
+
+    // Finding size of x per target
+    const int nelx = gc->nelx;
+    const int block_x = (int) nelx/num_targets;
+
+    for (int i = 0;i<num_targets;i++){
+        // Initializing node
+        (gpu_gc->target[i]).gc = (struct gridContext *) malloc(sizeof(struct gridContext));
+        struct gpuNode * node = &(gpu_gc->target[i]);
+        node -> id = i;
+        if (i>0){
+            node -> prev = &(gpu_gc->target[i-1]);
+            node -> offset_x = 1;
+        }
+        else {
+            node -> prev = NULL;
+            node -> offset_x = 0;
+        }
+        if (i<num_targets-1){
+            node -> next = &(gpu_gc->target[i+1]);
+        }
+        else {
+            node -> next = NULL;
+        }
+
+        // Global offset in the grid
+        node -> x_global = i *block_x;
+
+        // Calculating x size
+        int nelx_local = ( i== num_targets-1) ? nelx - i*num_targets : block_x;
+        if (node -> prev != NULL){
+            nelx_local += 1;
+        }
+        if (node -> next != NULL){
+            nelx_local += 1;
+        }
+        setupGC(node->gc,nl,nelx_local,gc->nely,gc->nelz);
+        if (verbose == 1) {
+            printf("GPU %d:\n\tGlobal index: %d\n\tOffset: %d\n",node->id,node->x_global,node->offset_x);
+            printf("\tPrev is null? %d\n\tNext is null? %d\n",node->prev == NULL,node->next==NULL);
+        }
+    }
+}
+
+void freeGPUGrid(
+    struct gpuGrid * gpu_gc,
+    const int nl)
+{
+    const int num_targets = gpu_gc->num_targets;
+    for (int i=0;i<num_targets;i++){
+        struct gpuNode * node = &(gpu_gc->target[i]);
+        freeGC(node->gc,nl);
+    }
+    free(gpu_gc->target);
+}
+
+void computePadding(
+    const struct gridContext * gc,
+    const int l,
+    int * ncell,
+    int * wrapx,
+    int * wrapy,
+    int * wrapz,
+    uint_fast32_t * ndof
+){
+    *ncell = pow(2, l);
+    const int32_t nelx = gc->nelx / (*ncell);
+    const int32_t nely = gc->nely / (*ncell);
+    const int32_t nelz = gc->nelz / (*ncell);
+    const int paddingx =
+        (STENCIL_SIZE_X - ((nelx + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X;
+    const int paddingy =
+        (STENCIL_SIZE_Y - ((nely + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y;
+    const int paddingz =
+        (STENCIL_SIZE_Z - ((nelz + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z;
+
+    // Writing the result
+    *wrapx = nelx + paddingx + 3;
+    *wrapy = nely + paddingy + 3;
+    *wrapz = nelz + paddingz + 3;
+    *ndof = 3 * (*wrapy) * (*wrapz) * (*wrapx);
+}
+
+DTYPE compute_volume(
+    const struct gridContext gc,
+    DTYPE * xPhys
+){
+    DTYPE vol = 0.0;
+    #pragma omp target teams distribute parallel for collapse(3) map(always,tofrom:vol) reduction(+ : vol)
+    for (int i = 1; i < gc.nelx + 1; i++)
+      for (int k = 1; k < gc.nelz + 1; k++)
+        for (int j = 1; j < gc.nely + 1; j++) {
+          const int idx =
+              i * (gc.wrapy - 1) * (gc.wrapz - 1) +
+              k * (gc.wrapy - 1) + j;
+
+          vol += xPhys[idx];
+        }
+    return vol;
+}
+
+float compute_change(
+    DTYPE * x,
+    DTYPE * xnew,
+    const int nelem
+){
+    float change = 0.0;
+
+    #pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:change) reduction(max : change)
+    for (uint_least32_t i = 0; i < nelem; i++) {
+      change = MAX(change, fabs(x[i] - xnew[i]));
+      x[i] = xnew[i];
+    }
+    return change;
+}
+
+void update_solution(
+    const DTYPE * x,
+    DTYPE * xnew,
+    const DTYPE * dv, 
+    const DTYPE * dc, 
+    const int nelem,
+    const DTYPE g
+){
+    DTYPE l1 = 0.0, l2 = 1e9, move = 0.2;
+
+    while ((l2 - l1) / (l1 + l2) > 1e-6) {
+      DTYPE lmid = 0.5 * (l2 + l1);
+      DTYPE gt = 0.0;
+
+
+      #pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:gt) firstprivate(move,lmid) reduction(+:gt)
+      for (uint_least32_t i = 0; i < nelem; i++) {
+        xnew[i] =
+              MAX(0.0, MAX(x[i] - move,
+                          MIN(1.0, MIN(x[i] + move,
+                                        x[i] * sqrt(-dc[i] / (dv[i] * lmid))))));
+        gt += dv[i] * (xnew[i] - x[i]);
+      }
+
+      gt += g;
+      if (gt > 0)
+        l1 = lmid;
+      else
+        l2 = lmid;
+    }
+}
+
+void enter_data(
+    const struct gridContext gridContext,
+    const struct SolverDataBuffer solverData,
+    const struct gpuGrid gpu_gc,
+    const DTYPE * xPhys,
+    const DTYPE * x,
+    const DTYPE * xnew,
+    const STYPE * U,
+    const CTYPE * F,
+    const DTYPE * dc,
+    const int nelem,
+    const int nl
+){
+    int u_size = 0;
+    const int ndof = 3 * gridContext.wrapx * gridContext.wrapy * gridContext.wrapz;
+    for (int l_tmp = 0;l_tmp < nl; l_tmp++){
+        int ncell;
+        int wrapxc;
+        int wrapyc;
+        int wrapzc;
+        uint_fast32_t ndofc;
+        computePadding(&gridContext,l_tmp,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc);
+        if (l_tmp == 0){
+            u_size = ndofc;
+        }
+        CTYPE *zptr = solverData.zmg[l_tmp];
+        CTYPE *dptr = solverData.dmg[l_tmp];
+        CTYPE *rptr = solverData.rmg[l_tmp];
+        MTYPE *invDptr = solverData.invD[l_tmp];
+        const MTYPE * KE = gpu_gc.target[0].gc->precomputedKE[l_tmp];
+        #pragma omp target enter data map(to:zptr[:ndofc],invDptr[:ndofc],rptr[:ndofc],\
+                                    dptr[:ndofc],KE[:24*24*ncell*ncell*ncell]) device(gpu_gc.target[0].id) nowait
+    }
+    #pragma omp target enter data map(to:xPhys[:nelem],x[:nelem],dc[:nelem],xnew[:nelem],U[:u_size],F[:ndof],\
+    solverData.r[:ndof],solverData.p[:ndof],solverData.q[:ndof],solverData.z[:ndof]) device(gpu_gc.target[0].id) nowait
+    #pragma omp taskwait
+}