diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e648b6c --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.o +*.json +*.vtu +*.txt + +benchmark +test_stencil_methods +top3d \ No newline at end of file
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e09ba9b --- /dev/null +++ b/Makefile @@ -0,0 +1,31 @@ +CFLAGS = -Wall -g -mp=gpu -gpu=cc80 -mp=noautopar -Msafeptr -march=native -O4 -mavx -mavx2 #-Minfo +DEFS = -DDTU_HPC +LIBS = -lcholmod -lm -lsuitesparseconfig -lcxsparse +INCL = "" + +ifdef OWN_CHOLMOD +LIBS += -L$(HOME)/SuiteSparse/SuiteSparse-5.10.1/lib -L/usr/lib64 -lopenblaso-r0.3.3 +INCL += -I$(HOME)/SuiteSparse/SuiteSparse-5.10.1/include/ +$(info Using custom suite-sparse) +else +LIBS += -L"/appl/SuiteSparse/5.1.2-sl73/lib/" -L"/appl/OpenBLAS/0.2.20/XeonGold6226R/gcc-6.4.0/lib" -lopenblas +INCL += -I"/appl/SuiteSparse/5.1.2-sl73/include/" +$(info Using old gbar suite-sparse) +endif + +$(info LIBS are $(LIBS)) +CC = nvc +CXX = g++ + +OBJ = stencil_methods.o stencil_assembly.o stencil_solvers.o stencil_grid_utility.o stencil_optimization.o local_matrix.o gpu_definitions.o + +all: top3d + +top3d: top3d.c $(OBJ) + $(CC) -std=c11 $(CFLAGS) -o $@ $^ $(INCL) $(DEFS) $(LIBS) + +%.o: %.c definitions.h + $(CC) -std=c11 $(CFLAGS) -o $@ -c $< $(INCL) $(DEFS) + +clean: + -rm -f top3dmgcg_matrixfree top3dmgcg_eightColored benchmark test_stencil_methods core top3d *.core *.o
diff --git a/README.md b/README.md index 86c5287..a000967 100644 --- a/README.md +++ b/README.md @@ -1 +1,61 @@ -# MatrixFreeOpenmpGPU \ No newline at end of file +# GPU Accelerated Topology Optimisation Based on OpenMP Target Offloading +This repository contains an implementation of a topology optimisation solver for linear elastic compliance minimisation in three dimensions. The implementation is based on OpenMP target offloading to one GPU. + +## Linear MBB Beam Example +The result after 20 design iterations can be seen in the following figure. +![Result after 20 design iterations](figures/result.png)
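For context, the problem being solved is the standard minimum-compliance topology optimisation formulation with SIMP-style interpolation; a summary is given below (the notation is not taken from the repository's own documentation):

$$
\begin{aligned}
\min_{\mathbf{x}}\quad & c(\mathbf{x}) = \mathbf{F}^{T}\mathbf{U}(\mathbf{x})\\
\text{s.t.}\quad & \mathbf{K}(\mathbf{x})\,\mathbf{U}(\mathbf{x}) = \mathbf{F},\\
& \textstyle\sum_{e} v_{e} x_{e} \le f\,V_{\text{total}},\qquad 0 \le x_{e} \le 1,
\end{aligned}
$$

where the element stiffness is interpolated as $E(x_e) = E_{\min} + x_e^{3}\,(E_0 - E_{\min})$, matching the `Emin + x*x*x*(E0 - Emin)` scaling used in the assembly routines of this patch, and $f$ is the volume fraction passed with the `-f` option.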
+ +## Compilation +Even though OpenMP offloading to GPUs is supported by a wide range of compilers such as LLVM/Clang, ROCm, ICPC, NVC, and GCC, the compilers do not support the same subset of the OpenMP specification. We used NVIDIA's NVC compiler from the NVIDIA HPC SDK. The OpenMP directives may need to be adjusted if you choose to use another compiler. +
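Before building the full solver, a quick way to verify that the chosen compiler and driver actually offload is a minimal target-region check such as the sketch below; this is an illustrative file (here called `offload_check.c`) that is not part of the repository, compiled with e.g. `nvc -mp=gpu -gpu=cc80 offload_check.c`:

```c
#include <omp.h>
#include <stdio.h>

int main(void) {
  int on_device = 0;
  // The target region runs on the GPU when offloading works; with
  // OMP_TARGET_OFFLOAD=MANDATORY a failed offload aborts instead of
  // silently falling back to the host.
  #pragma omp target map(from : on_device)
  {
    on_device = !omp_is_initial_device();
  }
  printf("devices visible: %d, target region ran on GPU: %s\n",
         omp_get_num_devices(), on_device ? "yes" : "no");
  return 0;
}
```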
+ +### Dependencies +We used the following versions of the `cholmod` library and the `nvc` compiler. + +| **Package** | **Version** | **Installation** | +| :--- | :--- | :--- | +| `SuiteSparse/CHOLMOD` | 5.1.2 | [See Github release notes](https://github.com/DrTimothyAldenDavis/SuiteSparse/releases/tag/v5.1.2) | +| `nvhpc`| 22.5 | [NVIDIA HPC SDK](https://developer.nvidia.com/nvidia-hpc-sdk-releases)| +| `CUDA` | 11.1 | [CUDA Toolkit 11.1.0](https://developer.nvidia.com/cuda-11.1.0-download-archive?target_os=Linux) | + +To load the dependencies, you may adjust the module names in `setup_gbar_env.sh`. To load the modules and update `LD_LIBRARY_PATH`, run the following command. +```bash +$ source setup_gbar_env.sh +``` + +### Compiling the Project +After loading the modules and updating the dynamic linker path, you may simply compile the project with +```bash +$ make +``` +Note that the project has been tuned for an NVIDIA Tesla A100 GPU, which has compute capability `-gpu=cc80`. For an NVIDIA Tesla V100, the compute capability must be adjusted to `-gpu=cc70`.
+## Running the Code + +To run 20 iterations of the code on a grid of 128 x 64 x 64 voxels and save the result, you may use the following commands: +```bash +$ export OMP_NUM_THREADS=12 +$ export OMP_TARGET_OFFLOAD=MANDATORY +$ export CUDA_VISIBLE_DEVICES=0 +$ ./top3d -x 16 -y 8 -z 8 -l 4 -w 1 +``` +In the above example, we specify that we wish to use four levels in the multi-grid preconditioner with `-l 4`. The application takes the size of the coarsest grid as input, so the desired fine-grid size must be divided by 2^(l-1). With `-l 4` this factor is 2^3 = 8, and `-x 16 -y 8 -z 8` therefore corresponds to a 128 x 64 x 64 fine grid. + +### Inputs and Options +The following table includes a list of all the available options. + +| **Option** | **Description** | **Default Value** | +| :--- | :--- | ---: | +| -x | The number of elements in the x-dimension on the coarsest grid. | 12 | +| -y | The number of elements in the y-dimension on the coarsest grid. | 6 | +| -z | The number of elements in the z-dimension on the coarsest grid. | 6 | +| -f | Volume fraction | 0.20 | +| -r | Filter radius in elements | 1.5 | +| -i | Number of design iterations | 20 | +| -v | Verbose mode. Set to 0 to disable, 1 to enable, or 2 for detailed information. | 0 | +| -l | Number of layers in multigrid. | 4 | +| -w | Write result to `.vtu` file by setting `-w 1`. | 0 | + +## Authorship +This code has been developed by Erik Albert Träff in collaboration with Anton Rydahl under the supervision of Niels Aage, Ole Sigmund, and Sven Karlsson.
diff --git a/definitions.h b/definitions.h new file mode 100644 index 0000000..ac7119d --- /dev/null +++ b/definitions.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include + +#define __force_inline __attribute__((always_inline)) +#define __force_unroll __attribute__((optimize("unroll-loops"))) +#define __alignBound 32 + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) > (b) ? (b) : (a)) + +// define stencil sizes at compile time +#define STENCIL_SIZE_Y 8 +// The code will not work if STENCIL_SIZE_X and STENCIL_SIZE_Z are not equal to one! +#define STENCIL_SIZE_X 1 +#define STENCIL_SIZE_Z 1 +#define number_of_matrix_free_levels 2 + +typedef double MTYPE; +typedef double STYPE; +typedef float CTYPE; + +typedef float DTYPE; // design type, for element densities, gradients and such.
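// Note on the remaining typedefs (added annotation, inferred from how they are used elsewhere
// in this patch): MTYPE holds stiffness-matrix entries (precomputedKE, invD, the CSR values),
// CTYPE the conjugate-gradient and multigrid work vectors, and STYPE the displacement field U,
// so double-precision matrices are combined with single-precision solver vectors.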
+ +struct FixedDofs { + uint_fast32_t n; + uint_fast32_t *idx; +}; + +struct gridContext { + double E0; + double Emin; + double nu; + double elementSizeX; + double elementSizeY; + double elementSizeZ; + uint_fast32_t nelx; + uint_fast32_t nely; + uint_fast32_t nelz; + uint_fast32_t wrapx; + uint_fast32_t wrapy; + uint_fast32_t wrapz; + double penal; + MTYPE **precomputedKE; + + struct FixedDofs *fixedDofs; +};
diff --git a/figures/cg_running_time.png b/figures/cg_running_time.png new file mode 100644 index 0000000..e77abb4 Binary files /dev/null and b/figures/cg_running_time.png differ diff --git a/figures/result.pdf b/figures/result.pdf new file mode 100644 index 0000000..0921466 Binary files /dev/null and b/figures/result.pdf differ diff --git a/figures/result.png b/figures/result.png new file mode 100644 index 0000000..565b0ab Binary files /dev/null and b/figures/result.png differ diff --git a/figures/running_time.png b/figures/running_time.png new file mode 100644 index 0000000..a15fd04 Binary files /dev/null and b/figures/running_time.png differ diff --git a/figures/speedup.png b/figures/speedup.png new file mode 100644 index 0000000..185aa86 Binary files /dev/null and b/figures/speedup.png differ
diff --git a/gpu_definitions.c b/gpu_definitions.c new file mode 100644 index 0000000..64e139a --- /dev/null +++ b/gpu_definitions.c @@ -0,0 +1,195 @@ +#include "gpu_definitions.h" + +void gridContextToGPUGrid( + struct gridContext * gc, + struct gpuGrid * gpu_gc, + const int nl, + const int verbose) +{ + const int num_targets = gpu_gc->num_targets; + gpu_gc->target = (struct gpuNode *) malloc(num_targets*sizeof(struct gpuNode)); + + // Finding size of x per target + const int nelx = gc->nelx; + const int block_x = (int) nelx/num_targets; + + for (int i = 0;i<num_targets;i++){ + (gpu_gc->target[i]).gc = (struct gridContext *) malloc(sizeof(struct gridContext)); + struct gpuNode * node = &(gpu_gc->target[i]); + node -> id = i; + if (i>0){ + node -> prev = &(gpu_gc->target[i-1]); + node -> offset_x = 1; + } + else { + node -> prev = NULL; + node -> offset_x = 0; + } + if (i < num_targets-1){ + node -> next = &(gpu_gc->target[i+1]); + } + else { + node -> next = NULL; + } + + // Global offset in the grid + node -> x_global = i *block_x; + + // Calculating x size + int nelx_local = ( i== num_targets-1) ? nelx - i*num_targets : block_x; + if (node -> prev != NULL){ + nelx_local += 1; + } + if (node -> next != NULL){ + nelx_local += 1; + } + setupGC(node->gc,nl,nelx_local,gc->nely,gc->nelz); + if (verbose == 1) { + printf("GPU %d:\n\tGlobal index: %d\n\tOffset: %d\n",node->id,node->x_global,node->offset_x); + printf("\tPrev is null? %d\n\tNext is null? %d\n",node->prev == NULL,node->next==NULL); + } + } +} + +void freeGPUGrid( + struct gpuGrid * gpu_gc, + const int nl) +{ + const int num_targets = gpu_gc->num_targets; + for (int i=0;i<num_targets;i++){ + struct gpuNode * node = &(gpu_gc->target[i]); + freeGC(node->gc,nl); + } + free(gpu_gc->target); +} + +void computePadding( + const struct gridContext * gc, + const int l, + int * ncell, + int * wrapx, + int * wrapy, + int * wrapz, + uint_fast32_t * ndof +){ + *ncell = pow(2, l); + const int32_t nelx = gc->nelx / (*ncell); + const int32_t nely = gc->nely / (*ncell); + const int32_t nelz = gc->nelz / (*ncell); + const int paddingx = + (STENCIL_SIZE_X - ((nelx + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X; + const int paddingy = + (STENCIL_SIZE_Y - ((nely + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingz = + (STENCIL_SIZE_Z - ((nelz + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + // Writing the result + *wrapx = nelx + paddingx + 3; + *wrapy = nely + paddingy + 3; + *wrapz = nelz + paddingz + 3; + *ndof = 3 * (*wrapy) * (*wrapz) * (*wrapx); +} + +DTYPE compute_volume( + const struct gridContext gc, + DTYPE * xPhys +){ + DTYPE vol = 0.0; + #pragma omp target teams distribute parallel for collapse(3) map(always,tofrom:vol) reduction(+ : vol) + for (int i = 1; i < gc.nelx + 1; i++) + for (int k = 1; k < gc.nelz + 1; k++) + for (int j = 1; j < gc.nely + 1; j++) { + const int idx = + i * (gc.wrapy - 1) * (gc.wrapz - 1) + + k * (gc.wrapy - 1) + j; + + vol += xPhys[idx]; + } + return vol; +} + +float compute_change( + DTYPE * x, + DTYPE * xnew, + const int nelem +){ + float change = 0.0; + + #pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:change) reduction(max : change) + for (uint_least32_t i = 0; i < nelem; i++) { + change = MAX(change, fabs(x[i] - xnew[i])); + x[i] = xnew[i]; + } + return change; +} + +void update_solution( + const DTYPE * x, + DTYPE * xnew, + const DTYPE * dv, + const DTYPE * dc, + const int nelem, + const DTYPE g +){ + DTYPE l1 = 0.0, l2 = 1e9, move = 0.2; + + while ((l2 - l1) / (l1 + l2) > 1e-6) { + DTYPE lmid = 0.5 * (l2 + l1); + DTYPE gt = 0.0; + + #pragma omp target teams distribute parallel for schedule(static) map(always,tofrom:gt) firstprivate(move,lmid) reduction(+:gt) + for (uint_least32_t i = 0; i < nelem; i++) { + xnew[i] = + MAX(0.0, MAX(x[i] - move, + MIN(1.0, MIN(x[i] + move, + x[i] * sqrt(-dc[i] / (dv[i] * lmid)))))); + gt += dv[i] * (xnew[i] - x[i]); + } + + gt += g; + if (gt > 0) + l1 = lmid; + else + l2 = lmid; + } +} + +void enter_data( + const struct gridContext gridContext, + const struct SolverDataBuffer solverData, + const struct gpuGrid gpu_gc, + const DTYPE * xPhys, + const DTYPE * x, + const DTYPE * xnew, + const STYPE * U, + const CTYPE * F, + const DTYPE * dc, + const int nelem, + const int nl +){ + int u_size = 0; + const int ndof = 3 * gridContext.wrapx * gridContext.wrapy * gridContext.wrapz; + for (int l_tmp = 0;l_tmp < nl; l_tmp++){ + int ncell; + int wrapxc; + int wrapyc; + int wrapzc; + uint_fast32_t ndofc; + computePadding(&gridContext,l_tmp,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc); + if (l_tmp == 0){ + u_size = ndofc; + } + CTYPE *zptr = solverData.zmg[l_tmp]; + CTYPE *dptr = solverData.dmg[l_tmp]; + CTYPE *rptr = solverData.rmg[l_tmp]; + MTYPE *invDptr = solverData.invD[l_tmp]; + const MTYPE * KE = gpu_gc.target[0].gc->precomputedKE[l_tmp]; + #pragma omp target enter data map(to:zptr[:ndofc],invDptr[:ndofc],rptr[:ndofc],\ + dptr[:ndofc],KE[:24*24*ncell*ncell*ncell]) device(gpu_gc.target[0].id) nowait + } + #pragma omp target enter data
map(to:xPhys[:nelem],x[:nelem],dc[:nelem],xnew[:nelem],U[:u_size],F[:ndof],\ + solverData.r[:ndof],solverData.p[:ndof],solverData.q[:ndof],solverData.z[:ndof]) device(gpu_gc.target[0].id) nowait + #pragma omp taskwait +} diff --git a/gpu_definitions.h b/gpu_definitions.h new file mode 100644 index 0000000..76c8396 --- /dev/null +++ b/gpu_definitions.h @@ -0,0 +1,88 @@ +#ifndef GPU_DEFINITIONS +#define GPU_DEFINITIONS +#include "definitions.h" +#include "stencil_grid_utility.h" +#include "stencil_assembly.h" + +struct SolverDataBuffer { + // cg data + CTYPE *r; + CTYPE *p; + CTYPE *q; + CTYPE *z; + + // jacobi + mg data + MTYPE **invD; + CTYPE **dmg; + CTYPE **rmg; + CTYPE **zmg; + + // explicitly assembled matrices + struct CSRMatrix *coarseMatrices; + struct CoarseSolverData bottomSolver; +}; + +struct gpuNode { + int id; + struct gpuNode * prev; + struct gpuNode * next; + struct gridContext * gc; + int offset_x; + int x_global; +}; + +struct gpuGrid { + int num_targets; + struct gpuNode * target; +}; + +void gridContextToGPUGrid( + struct gridContext * gc, + struct gpuGrid * gpu_gc, + const int nl, + const int verbose); + +void freeGPUGrid( + struct gpuGrid * gpu_gc, + const int nl); + +void computePadding( + const struct gridContext * gc, + const int l, + int * ncell, + int * wrapx, + int * wrapy, + int * wrapz, + uint_fast32_t * ndof); + +DTYPE compute_volume( + const struct gridContext gc, + DTYPE * xPhys); + +float compute_change( + DTYPE * x, + DTYPE * xnew, + const int nelem); + +void update_solution( + const DTYPE * x, + DTYPE * xnew, + const DTYPE * dv, + const DTYPE * dc, + const int nelem, + const DTYPE g); + +void enter_data( + const struct gridContext gridContext, + const struct SolverDataBuffer solverData, + const struct gpuGrid gpu_gc, + const DTYPE * xPhys, + const DTYPE * x, + const DTYPE * xnew, + const STYPE * U, + const CTYPE * F, + const DTYPE * dc, + const int nelem, + const int nl); + +#endif diff --git a/local_matrix.c b/local_matrix.c new file mode 100644 index 0000000..db3c64f --- /dev/null +++ b/local_matrix.c @@ -0,0 +1,319 @@ +#include "local_matrix.h" + +#include + +// compute the contitutive matrix +// temperature: frozen, called only in preprocessing +void getC(MTYPE C[6][6], /* out */ + const MTYPE nu) /* in */ +{ + const MTYPE temp1 = (1.0 - nu) / ((1.0 + nu) * (1.0 - 2.0 * nu)); + const MTYPE temp2 = nu / ((1.0 + nu) * (1.0 - 2.0 * nu)); + const MTYPE temp3 = 1.0 / (2.0 * (1.0 + nu)); + + for (unsigned int i = 0; i < 6 * 6; i++) + *((MTYPE *)C + i) = 0.0; + + C[0][0] = temp1; + C[1][1] = temp1; + C[2][2] = temp1; + C[3][3] = temp3; + C[4][4] = temp3; + C[5][5] = temp3; + C[0][1] = temp2; + C[1][0] = temp2; + C[0][2] = temp2; + C[2][0] = temp2; + C[1][2] = temp2; + C[2][1] = temp2; +} + +// compute the strain-displacement matrix +// temperature: frozen, called only in preprocessing +void getB(MTYPE B[6][24], /* out */ + MTYPE *jdet, /* out */ + const MTYPE iso[3], /* in */ + const MTYPE xe[24]) /* in */ +{ + /* xi = iso(1); */ + const MTYPE xi = iso[1 - 1]; + /* eta = iso(2); */ + const MTYPE eta = iso[2 - 1]; + /* zeta = iso(3); */ + const MTYPE zeta = iso[3 - 1]; + + const MTYPE n1xi = -0.125 * (1 - eta) * (1 - zeta); + const MTYPE n1eta = -0.125 * (1 - xi) * (1 - zeta); + const MTYPE n1zeta = -0.125 * (1 - xi) * (1 - eta); + const MTYPE n2xi = 0.125 * (1 - eta) * (1 - zeta); + const MTYPE n2eta = -0.125 * (1 + xi) * (1 - zeta); + const MTYPE n2zeta = -0.125 * (1 + xi) * (1 - eta); + + const MTYPE n3xi = 0.125 * (1 + eta) * (1 - zeta); + const MTYPE 
n3eta = 0.125 * (1 + xi) * (1 - zeta); + const MTYPE n3zeta = -0.125 * (1 + xi) * (1 + eta); + const MTYPE n4xi = -0.125 * (1 + eta) * (1 - zeta); + const MTYPE n4eta = 0.125 * (1 - xi) * (1 - zeta); + const MTYPE n4zeta = -0.125 * (1 - xi) * (1 + eta); + + const MTYPE n5xi = -0.125 * (1 - eta) * (1 + zeta); + const MTYPE n5eta = -0.125 * (1 - xi) * (1 + zeta); + const MTYPE n5zeta = 0.125 * (1 - xi) * (1 - eta); + const MTYPE n6xi = 0.125 * (1 - eta) * (1 + zeta); + const MTYPE n6eta = -0.125 * (1 + xi) * (1 + zeta); + const MTYPE n6zeta = 0.125 * (1 + xi) * (1 - eta); + + const MTYPE n7xi = 0.125 * (1 + eta) * (1 + zeta); + const MTYPE n7eta = 0.125 * (1 + xi) * (1 + zeta); + const MTYPE n7zeta = 0.125 * (1 + xi) * (1 + eta); + const MTYPE n8xi = -0.125 * (1 + eta) * (1 + zeta); + const MTYPE n8eta = 0.125 * (1 - xi) * (1 + zeta); + const MTYPE n8zeta = 0.125 * (1 - xi) * (1 + eta); + + MTYPE L[6][9]; + for (unsigned int i = 0; i < 6 * 9; i++) + *((MTYPE *)L + i) = 0.0; + + MTYPE jac[3][3]; + for (unsigned int i = 0; i < 3 * 3; i++) + *((MTYPE *)jac + i) = 0.0; + + MTYPE jacinvt[9][9]; + for (unsigned int i = 0; i < 9 * 9; i++) + *((MTYPE *)jacinvt + i) = 0.0; + + MTYPE Nt[9][24]; + for (unsigned int i = 0; i < 9 * 24; i++) + *((MTYPE *)Nt + i) = 0.0; + + L[0][0] = 1.0; + L[1][4] = 1.0; + L[2][8] = 1.0; + L[3][1] = 1.0; + L[3][3] = 1.0; + L[4][5] = 1.0; + L[4][7] = 1.0; + L[5][2] = 1.0; + L[5][6] = 1.0; + + Nt[0][0] = n1xi; + Nt[1][0] = n1eta; + Nt[2][0] = n1zeta; + Nt[0][3] = n2xi; + Nt[1][3] = n2eta; + Nt[2][3] = n2zeta; + Nt[0][6] = n3xi; + Nt[1][6] = n3eta; + Nt[2][6] = n3zeta; + Nt[0][9] = n4xi; + Nt[1][9] = n4eta; + Nt[2][9] = n4zeta; + Nt[0][12] = n5xi; + Nt[1][12] = n5eta; + Nt[2][12] = n5zeta; + Nt[0][15] = n6xi; + Nt[1][15] = n6eta; + Nt[2][15] = n6zeta; + Nt[0][18] = n7xi; + Nt[1][18] = n7eta; + Nt[2][18] = n7zeta; + Nt[0][21] = n8xi; + Nt[1][21] = n8eta; + Nt[2][21] = n8zeta; + + Nt[3][1] = n1xi; + Nt[4][1] = n1eta; + Nt[5][1] = n1zeta; + Nt[3][4] = n2xi; + Nt[4][4] = n2eta; + Nt[5][4] = n2zeta; + Nt[3][7] = n3xi; + Nt[4][7] = n3eta; + Nt[5][7] = n3zeta; + Nt[3][10] = n4xi; + Nt[4][10] = n4eta; + Nt[5][10] = n4zeta; + Nt[3][13] = n5xi; + Nt[4][13] = n5eta; + Nt[5][13] = n5zeta; + Nt[3][16] = n6xi; + Nt[4][16] = n6eta; + Nt[5][16] = n6zeta; + Nt[3][19] = n7xi; + Nt[4][19] = n7eta; + Nt[5][19] = n7zeta; + Nt[3][22] = n8xi; + Nt[4][22] = n8eta; + Nt[5][22] = n8zeta; + + Nt[6][2] = n1xi; + Nt[7][2] = n1eta; + Nt[8][2] = n1zeta; + Nt[6][5] = n2xi; + Nt[7][5] = n2eta; + Nt[8][5] = n2zeta; + Nt[6][8] = n3xi; + Nt[7][8] = n3eta; + Nt[8][8] = n3zeta; + Nt[6][11] = n4xi; + Nt[7][11] = n4eta; + Nt[8][11] = n4zeta; + Nt[6][14] = n5xi; + Nt[7][14] = n5eta; + Nt[8][14] = n5zeta; + Nt[6][17] = n6xi; + Nt[7][17] = n6eta; + Nt[8][17] = n6zeta; + Nt[6][20] = n7xi; + Nt[7][20] = n7eta; + Nt[8][20] = n7zeta; + Nt[6][23] = n8xi; + Nt[7][23] = n8eta; + Nt[8][23] = n8zeta; + + jac[0][0] = n1xi * xe[0] + n2xi * xe[3] + n3xi * xe[6] + n4xi * xe[9] + + n5xi * xe[12] + n6xi * xe[15] + n7xi * xe[18] + n8xi * xe[21]; + + jac[1][0] = n1eta * xe[0] + n2eta * xe[3] + n3eta * xe[6] + n4eta * xe[9] + + n5eta * xe[12] + n6eta * xe[15] + n7eta * xe[18] + n8eta * xe[21]; + + jac[2][0] = n1zeta * xe[0] + n2zeta * xe[3] + n3zeta * xe[6] + + n4zeta * xe[9] + n5zeta * xe[12] + n6zeta * xe[15] + + n7zeta * xe[18] + n8zeta * xe[21]; + + jac[0][1] = n1xi * xe[1] + n2xi * xe[4] + n3xi * xe[7] + n4xi * xe[10] + + n5xi * xe[13] + n6xi * xe[16] + n7xi * xe[19] + n8xi * xe[22]; + + jac[1][1] = n1eta * xe[1] + n2eta * xe[4] 
+ n3eta * xe[7] + n4eta * xe[10] + + n5eta * xe[13] + n6eta * xe[16] + n7eta * xe[19] + n8eta * xe[22]; + + jac[2][1] = n1zeta * xe[1] + n2zeta * xe[4] + n3zeta * xe[7] + + n4zeta * xe[10] + n5zeta * xe[13] + n6zeta * xe[16] + + n7zeta * xe[19] + n8zeta * xe[22]; + + jac[0][2] = n1xi * xe[2] + n2xi * xe[5] + n3xi * xe[8] + n4xi * xe[11] + + n5xi * xe[14] + n6xi * xe[17] + n7xi * xe[20] + n8xi * xe[23]; + + jac[1][2] = n1eta * xe[2] + n2eta * xe[5] + n3eta * xe[8] + n4eta * xe[11] + + n5eta * xe[14] + n6eta * xe[17] + n7eta * xe[20] + n8eta * xe[23]; + + jac[2][2] = n1zeta * xe[2] + n2zeta * xe[5] + n3zeta * xe[8] + + n4zeta * xe[11] + n5zeta * xe[14] + n6zeta * xe[17] + + n7zeta * xe[20] + n8zeta * xe[23]; + + /* jdet = det(jac); */ + // det(A)=A11(A22A33−A23A32)−A12(A21A33−A23A31)+A13(A21A32−A22A31) + (*jdet) = jac[0][0] * (jac[1][1] * jac[2][2] - jac[1][2] * jac[1][2]) - + jac[0][1] * (jac[1][0] * jac[2][2] - jac[1][2] * jac[2][0]) + + jac[0][2] * (jac[1][0] * jac[2][1] - jac[1][1] * jac[2][0]); + + /* ijac = inv(jac); */ + // https://en.wikipedia.org/wiki/Invertible_matrix#Inversion_of_3_%C3%97_3_matrices + jacinvt[0][0] = + (1.0 / (*jdet)) * (jac[1][1] * jac[2][2] - jac[1][2] * jac[1][2]); + jacinvt[0][1] = + (1.0 / (*jdet)) * -1.0 * (jac[1][0] * jac[2][2] - jac[1][2] * jac[2][0]); + jacinvt[0][2] = + (1.0 / (*jdet)) * (jac[1][0] * jac[2][1] - jac[1][1] * jac[2][0]); + + jacinvt[1][0] = + (1.0 / (*jdet)) * -1.0 * (jac[0][1] * jac[2][2] - jac[0][2] * jac[2][1]); + jacinvt[1][1] = + (1.0 / (*jdet)) * (jac[0][0] * jac[2][2] - jac[0][2] * jac[2][0]); + jacinvt[1][2] = + (1.0 / (*jdet)) * -1.0 * (jac[0][0] * jac[2][1] - jac[0][1] * jac[2][0]); + + jacinvt[2][0] = + (1.0 / (*jdet)) * (jac[0][1] * jac[1][2] - jac[0][2] * jac[1][1]); + jacinvt[2][1] = + (1.0 / (*jdet)) * -1.0 * (jac[0][0] * jac[1][2] - jac[0][2] * jac[1][0]); + jacinvt[2][2] = + (1.0 / (*jdet)) * (jac[0][0] * jac[1][1] - jac[0][1] * jac[1][0]); + + for (int k = 1; k < 3; k++) + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) + jacinvt[3 * k + i][3 * k + j] = jacinvt[i][j]; + + MTYPE partial[9][24]; + + /*! \todo add checks so that double / single precision GEMM + is selected. 
*/ + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 9, 24, 9, 1.0, + (MTYPE *)jacinvt, 9, (MTYPE *)Nt, 24, 0.0, (MTYPE *)partial, 24); + + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 6, 24, 9, 1.0, + (MTYPE *)L, 9, (MTYPE *)partial, 24, 0.0, (MTYPE *)B, 24); +} + +// compute the local stiffness matrix +// temperature: frozen, called only in preprocessing +void getKEsubspace(MTYPE *KEarray, /* out */ + const MTYPE nu, /* in */ + const int l) { /* in */ + + const int ncell = pow(2, l); + const int integrationPoints = 10; + + const MTYPE a = 0.5 * ncell; + const MTYPE b = 0.5 * ncell; + const MTYPE c = 0.5 * ncell; + + const MTYPE xe[24] = {-a, -b, -c, a, -b, -c, a, b, -c, -a, b, -c, + -a, -b, c, a, -b, c, a, b, c, -a, b, c}; + + MTYPE C[6][6]; + getC(C, nu); + + const MTYPE spacing = 2.0 / (MTYPE)ncell / (MTYPE)integrationPoints; + const MTYPE subCellVolume = spacing * spacing * spacing; + + MTYPE iso[3]; + MTYPE B[6][24]; + MTYPE partial[24][6]; + MTYPE jdet; + + for (int i = 0; i < 24 * 24 * ncell * ncell * ncell; i++) + KEarray[i] = 0.0; + + for (int ii = 0; ii < ncell; ii++) + for (int kk = 0; kk < ncell; kk++) + for (int jj = 0; jj < ncell; jj++) { + + const int cellidx = ncell * ncell * ii + ncell * kk + jj; + + MTYPE igp = -1.0 + spacing / 2.0 + 2.0 / (MTYPE)ncell * ((MTYPE)ii); + + for (int iii = 0; iii < integrationPoints; iii++) { + + MTYPE jgp = -1.0 + spacing / 2.0 + 2.0 / (MTYPE)ncell * ((MTYPE)jj); + + for (int jjj = 0; jjj < integrationPoints; jjj++) { + + MTYPE kgp = -1.0 + spacing / 2.0 + 2.0 / (MTYPE)ncell * ((MTYPE)kk); + + for (int kkk = 0; kkk < integrationPoints; kkk++) { + + iso[0] = igp; + iso[1] = -1.0 * jgp; /*important to flip y/eta-coordinate, due to + bad numbering in original code*/ + iso[2] = kgp; + + getB(B, &jdet, iso, xe); + + cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, 24, 6, 6, + 1.0, (MTYPE *)B, 24, (MTYPE *)C, 6, 0.0, + (MTYPE *)partial, 6); + + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 24, 24, 6, + jdet * subCellVolume, (MTYPE *)partial, 6, (MTYPE *)B, + 24, 1.0, KEarray + 24 * 24 * cellidx, 24); + + kgp += spacing; + } + jgp += spacing; + } + igp += spacing; + } + } +} diff --git a/local_matrix.h b/local_matrix.h new file mode 100644 index 0000000..f36f4c8 --- /dev/null +++ b/local_matrix.h @@ -0,0 +1,21 @@ +#pragma once + +#include "definitions.h" + +// compute the contitutive matrix +// temperature: frozen, called only in preprocessing +void getC(MTYPE C[6][6], /* out */ + const MTYPE nu); /* in */ + +// compute the strain-displacement matrix +// temperature: frozen, called only in preprocessing +void getB(MTYPE B[6][24], /* out */ + MTYPE *jdet, /* out */ + const MTYPE iso[3], /* in */ + const MTYPE xe[24]); /* in */ + +// compute the local stiffness matrix +// temperature: frozen, called only in preprocessing +void getKEsubspace(MTYPE *KEarray, /* out */ + const MTYPE nu, /* in */ + const int l); /* in */ diff --git a/plot_timings.m b/plot_timings.m new file mode 100644 index 0000000..4c4fe31 --- /dev/null +++ b/plot_timings.m @@ -0,0 +1,184 @@ +function plot_timings() +close all; +% reading C output + +threads_new=[1,4,8,16]; +threads_old=[1,4,8,16]; +threads_cpu=[1,4,8,16]; + +X = cell(length(threads_new),1); +Y = cell(length(threads_old),1); +Z = cell(length(threads_cpu),1); + + +for i=1:length(threads_new) + filename = strcat(['./tests/new_version_',num2str(threads_new(i)),'.txt']); + fileID = fopen(filename,'r'); + formatSpec = '%lf'; + A = fscanf(fileID,formatSpec); + X{i} = reshape(A,[7,length(A)/7]); 
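% Note (added annotation, inferred from the plotting code below rather than the data files):
% each column of X{i} is one benchmark run; row 1 holds the number of design iterations,
% row 5 the grid size, row 6 the number of conjugate gradient iterations, and row 7 the
% total wall-clock time. Rows 2-4 are not used by these plots.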
+end + +for i=1:length(threads_old) + filename = strcat(['./tests/old_version_',num2str(threads_old(i)),'.txt']); + fileID = fopen(filename,'r'); + formatSpec = '%lf'; + A = fscanf(fileID,formatSpec); + Y{i} = reshape(A,[7,length(A)/7]); +end + +for i=1:length(threads_cpu) + filename = strcat(['./tests/cpu_version_',num2str(threads_cpu(i)),'.txt']); + fileID = fopen(filename,'r'); + formatSpec = '%lf'; + A = fscanf(fileID,formatSpec); + Z{i} = reshape(A,[7,length(A)/7]); +end + +dim = [2,2]; + +%% Time per iteration + +fig = figure('visible','off','Renderer', 'painters'); +fig.Position(3:4)=[1000,800]; +sgtitle('Average Wall Clock Time per Iteration','fontsize',20) +for i=1:length(threads_new) + subplot(dim(1),dim(2),i) + new_time = X{i}(7,:)./X{i}(1,:); + plot(X{i}(5,:),new_time,'*',... + 'displayname','New GPU Version') + hold on + old_time = Y{i}(7,:)./Y{i}(1,:); + plot(Y{i}(5,:),old_time,'*',... + 'displayname','Original GPU Version') + cpu_time = Z{i}(7,:)./Z{i}(1,:); + plot(Z{i}(5,:),cpu_time,'*',... + 'displayname','Original CPU Version') + hold off + ylabel('$t\:[s]$','interpreter','latex','fontsize',14) + xlabel('Grid Size $N_x\times N_y \times N_z$','interpreter','latex','fontsize',14) + legend('location','nw') + title(strcat([num2str(threads_new(i)),' Threads']),... + 'interpreter','latex','fontsize',16) + grid() +end + +%saveas(fig,'./figures/running_time.png') +z=getframe(fig); +imwrite(z.cdata,'./figures/running_time.png') + +%% Time per Gonjugate Gradient Iteration + +fig = figure('visible','off','Renderer', 'painters'); +fig.Position(3:4)=[1000,800]; +sgtitle('Average Wall Clock Time per Conjugate Gradient Iteration','fontsize',20) +for i=1:length(threads_new) + subplot(dim(1),dim(2),i) + new_time = X{i}(7,:)./X{i}(6,:); + plot(X{i}(5,:),new_time,'*',... + 'displayname','New GPU Version') + hold on + old_time = Y{i}(7,:)./Y{i}(6,:); + plot(Y{i}(5,:),old_time,'*',... + 'displayname','Original GPU Version') + cpu_time = Z{i}(7,:)./Z{i}(6,:); + plot(Z{i}(5,:),cpu_time,'*',... + 'displayname','Original CPU Version') + hold off + ylabel('$t\:[s]$','interpreter','latex','fontsize',14) + xlabel('Grid Size $N_x\times N_y \times N_z$','interpreter','latex','fontsize',14) + legend('location','nw') + title(strcat([num2str(threads_new(i)),' Threads']),... + 'interpreter','latex','fontsize',16) + grid() +end + +z=getframe(fig); +imwrite(z.cdata,'./figures/cg_running_time.png') + +%% Time per Gonjugate Gradient Iteration + +fig = figure('visible','off','Renderer', 'painters'); +fig.Position(3:4)=[500,400]; +new_time = X{1}(7,:)./X{1}(1,:); +old_time = Y{1}(7,:)./Y{1}(1,:); +plot(X{1}(5,:),old_time./new_time,'*',... + 'displayname','1 Thread - Old GPU') +hold on +for i=2:length(threads_new) + new_time = X{i}(7,:)./X{i}(1,:); + old_time = Y{i}(7,:)./Y{i}(1,:); + plot(X{i}(5,:),old_time./new_time,'*',... 
+ 'displayname',strcat([num2str(threads_new(i)),' Threads'])) +end +hold off +ylabel('$\frac{t_{original}}{t_{new}}$','interpreter','latex','fontsize',16) +xlabel('Grid Size $N_x\times N_y \times N_z$','interpreter','latex','fontsize',14) +legend('location','ne') +grid() +title('Speed-up','fontsize',18) + +z=getframe(fig); +imwrite(z.cdata,'./figures/speedup.png') +end +% function plot_timings() +% close all; +% % reading C output +% +% threads_new=[1,4,8,16]; +% threads_old=[1,4,8,16]; +% +% X = cell(length(threads_new),1); +% Y = cell(length(threads_old),1); +% +% for i=1:length(threads_new) +% filename = strcat(['./tests/new_version_',num2str(threads_new(i)),'.txt']); +% fileID = fopen(filename,'r'); +% formatSpec = '%lf'; +% A = fscanf(fileID,formatSpec); +% X{i} = reshape(A,[7,length(A)/7]); +% end +% +% for i=1:length(threads_old) +% filename = strcat(['./tests/old_version_',num2str(threads_old(i)),'.txt']); +% fileID = fopen(filename,'r'); +% formatSpec = '%lf'; +% A = fscanf(fileID,formatSpec); +% Y{i} = reshape(A,[7,length(A)/7]); +% end +% +% dim = [1,2]; +% % 'visible','off', +% fig = figure('Renderer', 'painters', 'Position', [10 10 1000 330]); +% sgtitle('Average Wall Clock Time per Iteration','fontsize',20) +% subplot(dim(1),dim(2),1) +% plot(X{1}(5,:),X{1}(7,:)./X{1}(1,:),'*',... +% 'displayname',strcat([num2str(threads_new(1)),' threads'])) +% hold on +% for i=2:length(threads_new) +% plot(X{i}(5,:),X{i}(7,:)./X{i}(1,:),'*',... +% 'displayname',strcat([num2str(threads_new(i)),' threads'])) +% end +% hold off +% ylim([0,20]) +% ylabel('$t\:[s]$','interpreter','latex','fontsize',16) +% xlabel('Grid Size','interpreter','latex','fontsize',16) +% legend('location','nw') +% title('New GPU Version','interpreter','latex','fontsize',18) +% +% subplot(dim(1),dim(2),2) +% plot(Y{1}(5,:),Y{1}(7,:)./Y{1}(1,:),'*',... +% 'displayname',strcat([num2str(threads_old(1)),' threads'])) +% hold on +% for i=2:length(threads_old) +% plot(Y{i}(5,:),Y{i}(7,:)./Y{i}(1,:),'*',... 
+% 'displayname',strcat([num2str(threads_old(i)),' threads'])) +% end +% hold off +% %ylim([0,20]) +% ylabel('$t\:[s]$','interpreter','latex','fontsize',16) +% xlabel('Grid Size','interpreter','latex','fontsize',16) +% legend('location','nw') +% title('Old GPU Version','interpreter','latex','fontsize',18) +% %saveas(fig,'./figures/running_time.png') +% end \ No newline at end of file diff --git a/setup_gbar_env.sh b/setup_gbar_env.sh new file mode 100644 index 0000000..7da68ca --- /dev/null +++ b/setup_gbar_env.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +module load suitesparse/5.1.2 +module load nvhpc/22.5-nompi +module load cuda/11.1 + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/appl/SuiteSparse/5.1.2-sl73/lib:/appl/OpenBLAS/0.2.20/XeonGold6226R/gcc-6.4.0/lib \ No newline at end of file diff --git a/stencil_assembly.c b/stencil_assembly.c new file mode 100644 index 0000000..2d0b73d --- /dev/null +++ b/stencil_assembly.c @@ -0,0 +1,503 @@ +#include "stencil_assembly.h" + +#include "stencil_utility.h" + +#define CHOLMOD_INT_TYPE int + + +void allocateSubspaceMatrix(const struct gridContext gc, const int l, + struct CSRMatrix *M) { + + // compute number of rows + const int ncell = pow(2, l); + const int32_t nelxc = gc.nelx / ncell; + const int32_t nelyc = gc.nely / ncell; + const int32_t nelzc = gc.nelz / ncell; + + const int32_t nxc = nelxc + 1; + const int32_t nyc = nelyc + 1; + const int32_t nzc = nelzc + 1; + const int32_t ndofc = 3 * nxc * nyc * nzc; + + // allocate row offset array + (*M).nrows = ndofc; + (*M).rowOffsets = malloc(sizeof(int) * (ndofc + 1)); + + // calculate size of rows + (*M).rowOffsets[0] = 0; + +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(nxc,nzc,nyc) shared(M) + for (int i = 0; i < nxc; i++) + for (int k = 0; k < nzc; k++) + for (int j = 0; j < nyc; j++) { + const int nidx = (i * nyc * nzc + nyc * k + j); + + // add 1 to index to offset the result + const uint32_t idx1 = 3 * nidx + 0 + 1; + const uint32_t idx2 = 3 * nidx + 1 + 1; + const uint32_t idx3 = 3 * nidx + 2 + 1; + + const int32_t xmin = MAX(i - 1, 0); + const int32_t ymin = MAX(j - 1, 0); + const int32_t zmin = MAX(k - 1, 0); + + const int32_t xmax = MIN(i + 1, nxc - 1); + const int32_t ymax = MIN(j + 1, nyc - 1); + const int32_t zmax = MIN(k + 1, nzc - 1); + + const int32_t localSize = + 3 * (xmax - xmin + 1) * (ymax - ymin + 1) * (zmax - zmin + 1); + + (*M).rowOffsets[idx1] = localSize; + (*M).rowOffsets[idx2] = localSize; + (*M).rowOffsets[idx3] = localSize; + } + + // perform cummulative sum + for (int i = 1; i < ndofc + 1; i++) + (*M).rowOffsets[i] += (*M).rowOffsets[i - 1]; + + // allocate column and val arrays + const int nnz = (*M).rowOffsets[ndofc]; + (*M).nnz = nnz; + (*M).colIndex = malloc(sizeof(int) * nnz); + (*M).vals = malloc(sizeof(MTYPE) * nnz); + + // populate col array +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(nxc,nyc,nzc) shared(M) + for (int i = 0; i < nxc; i++) + for (int k = 0; k < nzc; k++) + for (int j = 0; j < nyc; j++) { + const int rowNodeIndex = i * nyc * nzc + nyc * k + j; + + // add 1 to index to offset the result + const uint32_t row1 = 3 * rowNodeIndex + 0; + const uint32_t row2 = 3 * rowNodeIndex + 1; + const uint32_t row3 = 3 * rowNodeIndex + 2; + + const int32_t xmin = MAX(i - 1, 0); + const int32_t ymin = MAX(j - 1, 0); + const int32_t zmin = MAX(k - 1, 0); + + const int32_t xmax = MIN(i + 2, nxc); + const int32_t ymax = MIN(j + 2, nyc); + const int32_t zmax = MIN(k + 2, nzc); + + int rowOffset1 = 
(*M).rowOffsets[row1]; + int rowOffset2 = (*M).rowOffsets[row2]; + int rowOffset3 = (*M).rowOffsets[row3]; + + for (int ii = xmin; ii < xmax; ii++) + for (int kk = zmin; kk < zmax; kk++) + for (int jj = ymin; jj < ymax; jj++) { + + const int colNodeIndex = ii * nyc * nzc + nyc * kk + jj; + + const uint32_t col1 = 3 * colNodeIndex + 0; + const uint32_t col2 = 3 * colNodeIndex + 1; + const uint32_t col3 = 3 * colNodeIndex + 2; + + (*M).colIndex[rowOffset1 + 0] = col1; + (*M).colIndex[rowOffset2 + 0] = col1; + (*M).colIndex[rowOffset3 + 0] = col1; + + (*M).colIndex[rowOffset1 + 1] = col2; + (*M).colIndex[rowOffset2 + 1] = col2; + (*M).colIndex[rowOffset3 + 1] = col2; + + (*M).colIndex[rowOffset1 + 2] = col3; + (*M).colIndex[rowOffset2 + 2] = col3; + (*M).colIndex[rowOffset3 + 2] = col3; + + rowOffset1 += 3; + rowOffset2 += 3; + rowOffset3 += 3; + } + } +} + +void freeSubspaceMatrix(struct CSRMatrix *M) { + + free((*M).rowOffsets); + free((*M).vals); + free((*M).colIndex); +} + +void assembleSubspaceMatrix(const struct gridContext gc, const int l, + const DTYPE *x, struct CSRMatrix M, MTYPE *tmp) { + +// zero the val array +#pragma omp parallel for schedule(static) default(none) shared(M) + for (int i = 0; i < M.nnz; i++) + M.vals[i] = 0.0; + + const int ncell = pow(2, l); + const int32_t nelxc = gc.nelx / ncell; + const int32_t nelyc = gc.nely / ncell; + const int32_t nelzc = gc.nelz / ncell; + + const int32_t nyc = nelyc + 1; + const int32_t nzc = nelzc + 1; + + const int paddingyc = + (STENCIL_SIZE_Y - ((nelyc + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingzc = + (STENCIL_SIZE_Z - ((nelzc + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + const int wrapyc = nelyc + paddingyc + 3; + const int wrapzc = nelzc + paddingzc + 3; + + // loop over active elements, in 8 colored fashion to avoid data-races + for (int32_t bx = 0; bx < 2; bx++) + for (int32_t bz = 0; bz < 2; bz++) + for (int32_t by = 0; by < 2; by++) + +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(bx,by,bz,nelxc,nelzc,nelyc) shared(gc,x,M) + for (int32_t i = bx; i < nelxc; i += 2) + for (int32_t k = bz; k < nelzc; k += 2) + for (int32_t j = by; j < nelyc; j += 2) { + + //// get 'true' edof + uint_fast32_t edof[24]; + getEdof_halo(edof, i, j, k, nyc, nzc); + + const int32_t i_halo = i + 1; + const int32_t j_halo = j + 1; + const int32_t k_halo = k + 1; + + // make local zero size buffer + alignas(__alignBound) MTYPE ke[24][24]; + for (int iii = 0; iii < 24; iii++) + for (int jjj = 0; jjj < 24; jjj++) + ke[iii][jjj] = 0.0; + + // assemble local matrix + for (int ii = 0; ii < ncell; ii++) + for (int kk = 0; kk < ncell; kk++) + for (int jj = 0; jj < ncell; jj++) { + const int ifine = ((i_halo - 1) * ncell) + ii + 1; + const int jfine = ((j_halo - 1) * ncell) + jj + 1; + const int kfine = ((k_halo - 1) * ncell) + kk + 1; + + const int cellidx = ncell * ncell * ii + ncell * kk + jj; + + const uint_fast32_t elementIndex = + ifine * (gc.wrapy - 1) * (gc.wrapz - 1) + + kfine * (gc.wrapy - 1) + jfine; + const MTYPE elementScale = + gc.Emin + x[elementIndex] * x[elementIndex] * + x[elementIndex] * (gc.E0 - gc.Emin); + + for (int iii = 0; iii < 24; iii++) + for (int jjj = 0; jjj < 24; jjj++) + ke[iii][jjj] += elementScale * + gc.precomputedKE[l][24 * 24 * cellidx + + 24 * iii + jjj]; + } + + // add matrix contribution to val slices + // initial naive implementation w. 
linear search and row-by-row + // update + // potential optimizations: bisection search, node-by-node + // update + for (int iii = 0; iii < 24; iii++) { + const int32_t rowStart = M.rowOffsets[edof[iii]]; + const int32_t rowEnd = M.rowOffsets[edof[iii] + 1]; + + for (int jjj = 0; jjj < 24; jjj++) { + // printf("Looking for %i in %i, [%i %i]: ", edof[jjj], + // edof[iii], rowStart, rowEnd); + + for (int idx = rowStart; idx < rowEnd; idx++) { + if (M.colIndex[idx] == edof[jjj]) { + //#pragma omp atomic update + M.vals[idx] += ke[iii][jjj]; + // printf("%i", idx); + break; + } + } + + // printf("\n"); + } + } + } + + // apply boundaryConditions +#pragma omp parallel for schedule(static) default(none) shared(M,tmp) + for (int row = 0; row < M.nrows; row++) { + tmp[row] = 1.0; + } + +#pragma omp parallel for schedule(static) default(none) firstprivate(wrapyc, wrapzc, nyc, nzc) shared(gc,tmp) + for (int i = 0; i < gc.fixedDofs[l].n; i++) { + const int row = + haloToTrue(gc.fixedDofs[l].idx[i], wrapyc, wrapzc, nyc, nzc); + tmp[row] = 0.0; + } + + // if either row or col is marked as fixed dof, zero the term +#pragma omp parallel for schedule(static) default(none) shared(M,tmp) + for (int row = 0; row < M.nrows; row++) { + const int32_t rowStart = M.rowOffsets[row]; + const int32_t rowEnd = M.rowOffsets[row + 1]; + + for (int col = rowStart; col < rowEnd; col++) + M.vals[col] *= tmp[row] * tmp[M.colIndex[col]]; + } + +#pragma omp parallel for schedule(static) default(none) firstprivate(wrapyc, wrapzc, nyc, nzc) shared(gc,M) + for (int i = 0; i < gc.fixedDofs[l].n; i++) { + const int row = + haloToTrue(gc.fixedDofs[l].idx[i], wrapyc, wrapzc, nyc, nzc); + + // printf("%i -> %i (%i)\n", gc.fixedDofs[l].idx[i], row, fdtmp.idx[i]); + const int32_t rowStart = M.rowOffsets[row]; + const int32_t rowEnd = M.rowOffsets[row + 1]; + for (int col = rowStart; col < rowEnd; col++) + if (row == M.colIndex[col]) + M.vals[col] = 1.0; + } +} + +void applyStateOperatorSubspaceMatrix(const struct gridContext gc, const int l, + const struct CSRMatrix M, const CTYPE *in, + CTYPE *out) { + + const int ncell = pow(2, l); + const int32_t nelxc = gc.nelx / ncell; + const int32_t nelyc = gc.nely / ncell; + const int32_t nelzc = gc.nelz / ncell; + + const int32_t nxc = nelxc + 1; + const int32_t nyc = nelyc + 1; + const int32_t nzc = nelzc + 1; + + const int paddingxc = + (STENCIL_SIZE_X - ((nelxc + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X; + const int paddingyc = + (STENCIL_SIZE_Y - ((nelyc + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingzc = + (STENCIL_SIZE_Z - ((nelzc + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + const int wrapxc = nelxc + paddingxc + 3; + const int wrapyc = nelyc + paddingyc + 3; + const int wrapzc = nelzc + paddingzc + 3; + + const int ndofc = 3 * wrapxc * wrapyc * wrapzc; + +#pragma omp parallel for schedule(static) default(none) firstprivate(ndofc) shared(out) + for (int i = 0; i < ndofc; i++) + out[i] = 0.0; + +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(nxc,nyc,nzc,wrapyc,wrapzc) shared(M,in,out) + for (int32_t i = 1; i < nxc + 1; i++) + for (int32_t k = 1; k < nzc + 1; k++) + for (int32_t j = 1; j < nyc + 1; j++) { + + const int haloNodeIndex = i * wrapyc * wrapzc + wrapyc * k + j; + const uint32_t rowHaloIndex1 = 3 * haloNodeIndex + 0; + const uint32_t rowHaloIndex2 = 3 * haloNodeIndex + 1; + const uint32_t rowHaloIndex3 = 3 * haloNodeIndex + 2; + + const uint32_t i_no_halo = i - 1; + const uint32_t j_no_halo = j - 1; + const uint32_t k_no_halo = k - 
1; + + const int rowNodeIndex = + i_no_halo * nyc * nzc + nyc * k_no_halo + j_no_halo; + const uint32_t rowIndex1 = 3 * rowNodeIndex + 0; + const uint32_t rowIndex2 = 3 * rowNodeIndex + 1; + const uint32_t rowIndex3 = 3 * rowNodeIndex + 2; + + double outBufferRow1 = 0.0; + double outBufferRow2 = 0.0; + double outBufferRow3 = 0.0; + + const int32_t xmin = MAX(i - 1, 1); + const int32_t ymin = MAX(j - 1, 1); + const int32_t zmin = MAX(k - 1, 1); + + const int32_t xmax = MIN(i + 2, nxc + 1); + const int32_t ymax = MIN(j + 2, nyc + 1); + const int32_t zmax = MIN(k + 2, nzc + 1); + + // recalculate indices instead of recreating i,j,k without halo + int offsetRow1 = M.rowOffsets[rowIndex1]; + int offsetRow2 = M.rowOffsets[rowIndex2]; + int offsetRow3 = M.rowOffsets[rowIndex3]; + + for (int ii = xmin; ii < xmax; ii++) + for (int kk = zmin; kk < zmax; kk++) + for (int jj = ymin; jj < ymax; jj++) { + + const int haloColNodeIndex = + ii * wrapyc * wrapzc + wrapyc * kk + jj; + + const uint32_t colHaloIndex1 = 3 * haloColNodeIndex + 0; + const uint32_t colHaloIndex2 = 3 * haloColNodeIndex + 1; + const uint32_t colHaloIndex3 = 3 * haloColNodeIndex + 2; + + outBufferRow1 += + (double)in[colHaloIndex1] * M.vals[offsetRow1 + 0] + + (double)in[colHaloIndex2] * M.vals[offsetRow1 + 1] + + (double)in[colHaloIndex3] * M.vals[offsetRow1 + 2]; + + outBufferRow2 += + (double)in[colHaloIndex1] * M.vals[offsetRow2 + 0] + + (double)in[colHaloIndex2] * M.vals[offsetRow2 + 1] + + (double)in[colHaloIndex3] * M.vals[offsetRow2 + 2]; + + outBufferRow3 += + (double)in[colHaloIndex1] * M.vals[offsetRow3 + 0] + + (double)in[colHaloIndex2] * M.vals[offsetRow3 + 1] + + (double)in[colHaloIndex3] * M.vals[offsetRow3 + 2]; + + offsetRow1 += 3; + offsetRow2 += 3; + offsetRow3 += 3; + } + + out[rowHaloIndex1] = outBufferRow1; + out[rowHaloIndex2] = outBufferRow2; + out[rowHaloIndex3] = outBufferRow3; + } +} + +void initializeCholmod(const struct gridContext gc, const int l, + struct CoarseSolverData *solverData, + const struct CSRMatrix M) { + + solverData->cholmodCommon = malloc(sizeof(cholmod_common)); + + // In order to enable GPU acceloration we must use _l_ functions + cholmod_start(solverData->cholmodCommon); + solverData->cholmodCommon->nmethods = 9; + + solverData->sparseMatrix = cholmod_allocate_sparse( + M.nrows, /* # of rows of A */ + M.nrows, /* # of columns of A */ + M.nnz, /* max # of nonzeros of A */ + 1, /* TRUE if columns of A sorted, FALSE otherwise */ + 1, /* TRUE if A will be packed, FALSE otherwise */ + 1, /* stype of A 0=use both upper and lower, 1=use upper, -1 use lower */ + CHOLMOD_REAL, /* CHOLMOD_PATTERN, _REAL, _COMPLEX, or _ZOMPLEX */ + solverData->cholmodCommon); + +#pragma omp parallel for schedule(static) default(none) shared(M,solverData) + for (int i = 0; i < M.nrows + 1; i++) + ((CHOLMOD_INT_TYPE *)solverData->sparseMatrix->p)[i] = M.rowOffsets[i]; + +#pragma omp parallel for schedule(static) default(none) shared(M,solverData) + for (int i = 0; i < M.nnz; i++) + ((CHOLMOD_INT_TYPE *)solverData->sparseMatrix->i)[i] = M.colIndex[i]; + + solverData->factoredMatrix = cholmod_analyze( + solverData->sparseMatrix, /* matrix to order and analyze */ + solverData->cholmodCommon); + + solverData->rhs = cholmod_allocate_dense( + M.nrows, /* # of rows of matrix */ + 1, /* # of cols of matrix */ + M.nrows, /* leading dimension */ + CHOLMOD_REAL, /* CHOLMOD_REAL, _COMPLEX, or _ZOMPLEX */ + solverData->cholmodCommon); + + solverData->solution = NULL; + solverData->Y_workspace = NULL; + solverData->E_workspace = 
NULL; +} + +void finishCholmod(const struct gridContext gc, const int l, + struct CoarseSolverData *solverData, + const struct CSRMatrix M, const int verbose) { + if (verbose == 1){ + cholmod_print_common("common", solverData->cholmodCommon); + } + cholmod_free_dense(&solverData->rhs, solverData->cholmodCommon); + cholmod_free_dense(&solverData->solution, solverData->cholmodCommon); + cholmod_free_dense(&solverData->Y_workspace, solverData->cholmodCommon); + cholmod_free_dense(&solverData->E_workspace, solverData->cholmodCommon); + + cholmod_free_sparse(&solverData->sparseMatrix, solverData->cholmodCommon); + cholmod_free_factor(&solverData->factoredMatrix, solverData->cholmodCommon); + + cholmod_finish(solverData->cholmodCommon); + free(solverData->cholmodCommon); +} + +void factorizeSubspaceMatrix(const struct gridContext gc, const int l, + struct CoarseSolverData solverData, + const struct CSRMatrix M) { + #pragma omp parallel for schedule(static) default(none) shared(M,solverData) + for (int i = 0; i < M.nnz; i++) + ((double *)solverData.sparseMatrix->x)[i] = M.vals[i]; + + cholmod_factorize(solverData.sparseMatrix, solverData.factoredMatrix, + solverData.cholmodCommon); +} + +void solveSubspaceMatrix(const struct gridContext gc, const int l, + struct CoarseSolverData solverData, const CTYPE *in, + CTYPE *out) { + const int ncell = pow(2, l); + const int32_t nelxc = gc.nelx / ncell; + const int32_t nelyc = gc.nely / ncell; + const int32_t nelzc = gc.nelz / ncell; + + const int32_t nxc = nelxc + 1; + const int32_t nyc = nelyc + 1; + const int32_t nzc = nelzc + 1; + + const int paddingyc = + (STENCIL_SIZE_Y - ((nelyc + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingzc = + (STENCIL_SIZE_Z - ((nelzc + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + const int wrapyc = nelyc + paddingyc + 3; + const int wrapzc = nelzc + paddingzc + 3; + +// copy grid data to vector +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(nxc,nyc,nzc) shared(solverData,in) + for (int i = 1; i < nxc + 1; i++) + for (int k = 1; k < nzc + 1; k++) + for (int j = 1; j < nyc + 1; j++) { + const int nidx = ((i - 1) * nyc * nzc + (k - 1) * nyc + (j - 1)); + const int nidx_s = (i * wrapyc * wrapzc + wrapyc * k + j); + + ((double *)solverData.rhs->x)[3 * nidx + 0] = in[3 * nidx_s + 0]; + ((double *)solverData.rhs->x)[3 * nidx + 1] = in[3 * nidx_s + 1]; + ((double *)solverData.rhs->x)[3 * nidx + 2] = in[3 * nidx_s + 2]; + } + + // cholmod_print_dense(solverData.rhs, "rhs", &solverData.cholmodCommon); + + // call cholmod solve + + cholmod_solve2(CHOLMOD_A, // system to solve + solverData.factoredMatrix, // factorization to use + solverData.rhs, // right-hand-side + NULL, // handle + &solverData.solution, // solution, allocated if need be + NULL, // handle + &solverData.Y_workspace, // workspace, or NULL + &solverData.E_workspace, // workspace, or NULL + solverData.cholmodCommon); + +// printf("cholmod solve status: %i\n", solveSuccess); + +// cholmod_print_dense(solverData.solution, "solution", +// &solverData.cholmodCommon); + +// copy data back to grid format +#pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(nxc,nyc,nzc) shared(out,solverData) + for (int i = 1; i < nxc + 1; i++) + for (int k = 1; k < nzc + 1; k++) + for (int j = 1; j < nyc + 1; j++) { + const int nidx = ((i - 1) * nyc * nzc + (k - 1) * nyc + (j - 1)); + const int nidx_s = (i * wrapyc * wrapzc + wrapyc * k + j); + + out[3 * nidx_s + 0] = ((double *)solverData.solution->x)[3 * nidx + 0]; + 
out[3 * nidx_s + 1] = ((double *)solverData.solution->x)[3 * nidx + 1]; + out[3 * nidx_s + 2] = ((double *)solverData.solution->x)[3 * nidx + 2]; + } +} diff --git a/stencil_assembly.h b/stencil_assembly.h new file mode 100644 index 0000000..aee34bb --- /dev/null +++ b/stencil_assembly.h @@ -0,0 +1,55 @@ +#pragma once + +#include "definitions.h" + +// #include "/zhome/c3/7/127558/SuiteSparse/SuiteSparse-5.10.1/include/cholmod.h" +#include "cholmod.h" + +struct CSRMatrix { + uint64_t nnz; + int32_t nrows; + + int *rowOffsets; + int *colIndex; + MTYPE *vals; +}; + +struct CoarseSolverData { + cholmod_common *cholmodCommon; + cholmod_sparse *sparseMatrix; + cholmod_factor *factoredMatrix; + + cholmod_dense *rhs; + cholmod_dense *solution; + + cholmod_dense *Y_workspace; + cholmod_dense *E_workspace; +}; + +void allocateSubspaceMatrix(const struct gridContext gc, const int l, + struct CSRMatrix *M); + +void freeSubspaceMatrix(struct CSRMatrix *M); + +void assembleSubspaceMatrix(const struct gridContext gc, const int l, + const DTYPE *x, struct CSRMatrix M, MTYPE *tmp); + +void applyStateOperatorSubspaceMatrix(const struct gridContext gc, const int l, + const struct CSRMatrix M, const CTYPE *in, + CTYPE *out); + +void initializeCholmod(const struct gridContext gc, const int l, + struct CoarseSolverData *ssolverData, + const struct CSRMatrix M); + +void finishCholmod(const struct gridContext gc, const int l, + struct CoarseSolverData *solverData, + const struct CSRMatrix M, const int verbose); + +void factorizeSubspaceMatrix(const struct gridContext gc, const int l, + struct CoarseSolverData solverData, + const struct CSRMatrix M); + +void solveSubspaceMatrix(const struct gridContext gc, const int l, + struct CoarseSolverData solverData, const CTYPE *in, + CTYPE *out); diff --git a/stencil_grid_utility.c b/stencil_grid_utility.c new file mode 100644 index 0000000..b185d1e --- /dev/null +++ b/stencil_grid_utility.c @@ -0,0 +1,164 @@ +#include "stencil_grid_utility.h" + +#include "local_matrix.h" + +#include "gpu_definitions.h" + +void setFixedDof_halo(struct gridContext *gc, const int l) { + + // Computing dimensions for grid + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc); + const int32_t nelyc = (*gc).nely / ncell; + const int32_t nelzc = (*gc).nelz / ncell; + + const int nyc = (nelyc + 1); + const int nzc = (nelzc + 1); + + // classic cantilever + // (*gc).fixedDofs[l].n = 3 * nyc * nzc; + // (*gc).fixedDofs[l].idx = malloc(sizeof(uint_fast32_t) * + // (*gc).fixedDofs[l].n); int offset = 0; for (uint_fast32_t k = 1; k < (nzc + + // 1); k++) + // for (uint_fast32_t j = 1; j < (nyc + 1); j++) { + // (*gc).fixedDofs[l].idx[offset + 0] = + // 3 * (wrapyc * wrapzc + wrapyc * k + j) + 0; + // (*gc).fixedDofs[l].idx[offset + 1] = + // 3 * (wrapyc * wrapzc + wrapyc * k + j) + 1; + // (*gc).fixedDofs[l].idx[offset + 2] = + // 3 * (wrapyc * wrapzc + wrapyc * k + j) + 2; + // offset += 3; + // } + + // new cantilever + const int nodelimit = (nelyc / 4) + 1; + (*gc).fixedDofs[l].n = 3 * nzc * 2 * nodelimit; + (*gc).fixedDofs[l].idx = malloc(sizeof(uint_fast32_t) * (*gc).fixedDofs[l].n); + int offset = 0; + const int i = 1; + for (uint_fast32_t k = 1; k < (nzc + 1); k++) { + for (uint_fast32_t j = 1; j < nodelimit + 1; j++) { + (*gc).fixedDofs[l].idx[offset + 0] = + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 0; + (*gc).fixedDofs[l].idx[offset + 1] = + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 1; + (*gc).fixedDofs[l].idx[offset + 2] 
= + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 2; + offset += 3; + } + for (uint_fast32_t j = (nyc + 1) - nodelimit; j < (nyc + 1); j++) { + (*gc).fixedDofs[l].idx[offset + 0] = + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 0; + (*gc).fixedDofs[l].idx[offset + 1] = + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 1; + (*gc).fixedDofs[l].idx[offset + 2] = + 3 * (i * wrapyc * wrapzc + wrapyc * k + j) + 2; + offset += 3; + } + } + + const uint_fast32_t n = (*gc).fixedDofs[l].n; + const uint_fast32_t *pidx = (*gc).fixedDofs[l].idx; + + #pragma omp target enter data map(to : pidx[:n]) +} + +void setupGC(struct gridContext *gc, const int nl, const int nelx, const int nely, const int nelz) { + + (*gc).E0 = 1; + (*gc).Emin = 1e-6; + (*gc).nu = 0.3; + (*gc).penal = 3; // dummy variable, does nothing + + (*gc).elementSizeX = 0.5; + (*gc).elementSizeY = 0.5; + (*gc).elementSizeZ = 0.5; + + (*gc).nelx = nelx; + (*gc).nely = nely; + (*gc).nelz = nelz; + + const int paddingx = + (STENCIL_SIZE_X - (((*gc).nelx + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X; + const int paddingy = + (STENCIL_SIZE_Y - (((*gc).nely + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingz = + (STENCIL_SIZE_Z - (((*gc).nelz + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + (*gc).wrapx = (*gc).nelx + paddingx + 3; + (*gc).wrapy = (*gc).nely + paddingy + 3; + (*gc).wrapz = (*gc).nelz + paddingz + 3; + + (*gc).precomputedKE = malloc(sizeof(MTYPE *) * nl); + (*gc).fixedDofs = malloc(sizeof(struct FixedDofs) * nl); + + for (int l = 0; l < nl; l++) { + const int ncell = pow(2, l); + const int pKESize = 24 * 24 * ncell * ncell * ncell; + (*gc).precomputedKE[l] = malloc(sizeof(MTYPE) * pKESize); + getKEsubspace((*gc).precomputedKE[l], (*gc).nu, l); + + setFixedDof_halo(gc, l); + + //MTYPE *pKE = (*gc).precomputedKE[l]; + +//#pragma omp target enter data map(to : pKE[:pKESize]) + } +} + +void freeGC(struct gridContext *gc, const int nl) { + + for (int l = 0; l < nl; l++) { + free((*gc).precomputedKE[l]); + free((*gc).fixedDofs[l].idx); + } + + free((*gc).precomputedKE); + free((*gc).fixedDofs); +} + +void allocateZeroPaddedStateField(const struct gridContext gc, const int l, + CTYPE **v) { + + // Computing dimensions for coarse grid + int ncell,wrapx,wrapy,wrapz; + uint_fast32_t ndof; + computePadding(&gc,l,&ncell,&wrapx,&wrapy,&wrapz,&ndof); + + (*v) = malloc(sizeof(CTYPE) * ndof); + +#pragma omp parallel for schedule(static) default(none) firstprivate(ndof) shared(v) + for (int i = 0; i < ndof; i++) + (*v)[i] = 0.0; +} + +void allocateZeroPaddedStateField_MTYPE(const struct gridContext gc, + const int l, MTYPE **v) { + + // Computing dimensions for coarse grid + int ncell,wrapx,wrapy,wrapz; + uint_fast32_t ndof; + computePadding(&gc,l,&ncell,&wrapx,&wrapy,&wrapz,&ndof); + + (*v) = malloc(sizeof(MTYPE) * ndof); + +#pragma omp parallel for schedule(static) default(none) firstprivate(ndof) shared(v) + for (int i = 0; i < ndof; i++) + (*v)[i] = 0.0; +} + +void allocateZeroPaddedStateField_STYPE(const struct gridContext gc, + const int l, STYPE **v) { + + // Computing dimensions for coarse grid + int ncell,wrapx,wrapy,wrapz; + uint_fast32_t ndof; + computePadding(&gc,l,&ncell,&wrapx,&wrapy,&wrapz,&ndof); + + (*v) = malloc(sizeof(STYPE) * ndof); + +#pragma omp parallel for schedule(static) default(none) firstprivate(ndof) shared(v) + for (int i = 0; i < ndof; i++) + (*v)[i] = 0.0; +} diff --git a/stencil_grid_utility.h b/stencil_grid_utility.h new file mode 100644 index 0000000..1985a24 --- /dev/null +++ b/stencil_grid_utility.h @@ -0,0 
+1,17 @@ +#pragma once + +#include "definitions.h" + +void setFixedDof_halo(struct gridContext *gc, const int l); + +void setupGC(struct gridContext *gc, const int nl, const int nelx, const int nely, const int nelz); + +void freeGC(struct gridContext *gc, const int nl); + +void allocateZeroPaddedStateField(const struct gridContext gc, const int l, + CTYPE **v); +void allocateZeroPaddedStateField_MTYPE(const struct gridContext gc, + const int l, MTYPE **v); + +void allocateZeroPaddedStateField_STYPE(const struct gridContext gc, + const int l, STYPE **v); diff --git a/stencil_methods.c b/stencil_methods.c new file mode 100644 index 0000000..10b07b3 --- /dev/null +++ b/stencil_methods.c @@ -0,0 +1,862 @@ +#include "stencil_methods.h" + +#include "stencil_utility.h" + +#include + +void applyStateOperator_stencil(const struct gpuNode * gpu_node, const DTYPE *x, + const CTYPE *in, CTYPE *out) { + const struct gridContext gc = *(gpu_node->gc); + const uint32_t nx = gc.nelx + 1; + const uint32_t ny = gc.nely + 1; + const uint32_t nz = gc.nelz + 1; + + // this is necessary for omp to recognize that gc.precomputedKE[0] is already + // mapped + const MTYPE *precomputedKE = gc.precomputedKE[0]; + const int wrapy = gc.wrapy; + const int wrapz = gc.wrapz; + + // loop over elements, depends on the which level you are on. For the finest + // (level 0) nelx*nely*nelz = 100.000 or more, but for every level you go down + // the number of iterations reduce by a factor of 8. i.e. level 2 will only + // have ~1000. This specific loop accounts for ~90% runtime + //#pragma omp teams distribute parallel for collapse(3) schedule(static) + +#pragma omp target teams distribute parallel for schedule(static) collapse(3) device(gpu_node->id) + for (int32_t i = 1; i < nx + 1; i += STENCIL_SIZE_X) { + for (int32_t k = 1; k < nz + 1; k += STENCIL_SIZE_Z) { + for (int32_t j = 1; j < ny + 1; j += STENCIL_SIZE_Y) { + + alignas(__alignBound) MTYPE out_x[STENCIL_SIZE_Y]; + alignas(__alignBound) MTYPE out_y[STENCIL_SIZE_Y]; + alignas(__alignBound) MTYPE out_z[STENCIL_SIZE_Y]; + + alignas(__alignBound) MTYPE in_x[STENCIL_SIZE_Y]; + alignas(__alignBound) MTYPE in_y[STENCIL_SIZE_Y]; + alignas(__alignBound) MTYPE in_z[STENCIL_SIZE_Y]; + +// zero the values about to be written in this + #pragma omp simd safelen(STENCIL_SIZE_Y) simdlen(STENCIL_SIZE_Y) \ + aligned(out_x, out_y, out_z \ + : __alignBound) + for (int jj = 0; jj < STENCIL_SIZE_Y; jj++) { + out_x[jj] = 0.0; + out_y[jj] = 0.0; + out_z[jj] = 0.0; + } + + // center line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){0, 0, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + 
(const int[]){0, 0, 0}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 0}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, 1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 0}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 0}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 0}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 0}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, -1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 0}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 0}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 0}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 0}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // side line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){0, 0, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 1}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 1}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 1}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, 1}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, 1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 1}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, 1}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, -1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 1}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, 1}, + (const int[]){0, 
-1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // side line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){0, 0, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, -1}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, -1}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, -1}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 0, -1}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, 1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, -1}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, 1, -1}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){0, -1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, -1}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){0, -1, -1}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // side line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){1, 0, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 0}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 0}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 0}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 0}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, 1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 1, 0}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 1, 0}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, -1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, -1, 0}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, -1, 0}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // side line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){-1, 0, 0}, in, in_x, in_y, + in_z); + 
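+          // The same load-then-accumulate pattern repeats for every node of the
+          // 3x3x3 neighbourhood: loadStencilInput fetches that node's three dofs
+          // once, and each applyStateStencilSpoke_finegrid call that follows adds
+          // the contribution of one element shared by the centre node and that
+          // neighbour (8 spokes for the centre node, 4 per face neighbour, 2 per
+          // edge neighbour and 1 per corner), 8 x 8 = 64 spoke applications in all.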
applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 0}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 0}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 0}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 0}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, -1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, -1, 0}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, -1, 0}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, 1, 0}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 1, 0}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 1, 0}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // edge line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){-1, 1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 1, -1}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, 0, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, -1}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, -1}, + (const int[]){-1, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, -1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, -1, -1}, + (const int[]){-1, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // edge line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){1, 0, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, -1}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, -1}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, -1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, -1, -1}, + (const int[]){0, -1, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, 1, -1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 1, -1}, + (const int[]){0, 0, -1}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // edge line, uses uses the same 15 
doubles from in + loadStencilInput(gc, i, j, k, (const int[]){1, 0, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 1}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 0, 1}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, 1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, 1, 1}, + (const int[]){0, 0, 0}, ny, nz, x, in_x, + in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){1, -1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){1, -1, 1}, + (const int[]){0, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + // edge line, uses uses the same 15 doubles from in + loadStencilInput(gc, i, j, k, (const int[]){-1, 0, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 1}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 0, 1}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, -1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, -1, 1}, + (const int[]){-1, -1, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + + loadStencilInput(gc, i, j, k, (const int[]){-1, 1, 1}, in, in_x, in_y, + in_z); + applyStateStencilSpoke_finegrid(gc, precomputedKE, i, j, k, + (const int[]){-1, 1, 1}, + (const int[]){-1, 0, 0}, ny, nz, x, + in_x, in_y, in_z, out_x, out_y, out_z); + #pragma omp simd safelen(STENCIL_SIZE_Y) simdlen(STENCIL_SIZE_Y) + for (int jj = 0; jj < STENCIL_SIZE_Y; jj++) { + const uint_fast32_t offset = + 3 * (i * wrapy * wrapz + k * wrapy + j + jj); + out[offset + 0] = out_x[jj]; + out[offset + 1] = out_y[jj]; + out[offset + 2] = out_z[jj]; + } + } + } + } + +// zero out the extra padded nodes +#pragma omp target teams distribute parallel for collapse(3) schedule(static) device(gpu_node->id) + for (int32_t i = 0; i < gc.wrapx; i++) + for (int32_t k = 0; k < gc.wrapz; k++) + for (int32_t j = ny + 1; j < gc.wrapy; j++) { + + const uint_fast32_t offset = + 3 * (i * wrapy * wrapz + k * wrapy + j); + + out[offset + 0] = 0.0; + out[offset + 1] = 0.0; + out[offset + 2] = 0.0; + } + + const uint_fast32_t n = gc.fixedDofs[0].n; + const uint_fast32_t *fidx = gc.fixedDofs[0].idx; + +// apply boundaryConditions +#pragma omp target teams distribute parallel for schedule(static) device(gpu_node->id) + for (int i = 0; i < n; i++) { + out[fidx[i]] = in[fidx[i]]; + } +} + + +// Apply the global matrix vector product out = K * in +// temperature: very hot, called ~25 x (number of mg levels [1-5]) x +// (number of cg iterations [125-2500]) = [125-12500] times pr design +// iteration + +void applyStateOperatorSubspace_halo(const struct gpuNode * gpu_node, const int l, + const DTYPE *x, CTYPE *in, CTYPE *out) { + const struct gridContext gc = *(gpu_node->gc); + + // Computing dimensions for coarse grid + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(&gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc); + + const int32_t nelxc = gc.nelx / ncell; + const int32_t 
nelyc = gc.nely / ncell; + const int32_t nelzc = gc.nelz / ncell; + + MTYPE* KE = gc.precomputedKE[l]; + uint_fast32_t * idx = gc.fixedDofs[l].idx; + uint_fast32_t n = gc.fixedDofs[l].n; + + #pragma omp target teams distribute parallel for schedule(static) device(gpu_node->id) + for (uint32_t i = 0; i < ndofc; i++) + out[i] = 0.0; + + for (int32_t bx = 0; bx < 2; bx++) + for (int32_t bz = 0; bz < 2; bz++) + for (int32_t by = 0; by < 2; by++) + #pragma omp target teams distribute parallel for collapse(4) schedule(static) device(gpu_node->id) + for (int32_t i = bx + 1; i < nelxc + 1; i += 2) + for (int32_t k = bz + 1; k < nelzc + 1; k += 2) + for (int32_t j = by + 1; j < nelyc + 1; j += 2) + for (int imv=0;imv<24;imv++){ + MTYPE out_local = 0.0; + uint_fast32_t idx_update = 0; + for (int kt=0;kt<2;kt++){ + for (int jt=0;jt<2;jt++){ + for (int it=0;it<2;it++){ + int nx = i + it; + if (jt == 1){ + nx = i + 1-it; + } + const int pos = 3*(kt*4+jt*2+it); + const int nz = k + kt; + const int ny = j + 1-jt; + const uint_fast32_t nIndex = nx * wrapyc * wrapzc + nz * wrapyc + ny; + uint_fast32_t edof = 3 * nIndex; + for (int off1 = 0;off1<3;off1++){ + if (pos + off1 == imv){ + idx_update = edof+off1; + } + } + for (int ii = 0; ii < ncell; ii++) + for (int kk = 0; kk < ncell; kk++) + for (int jj = 0; jj < ncell; jj++){ + const int ifine = ((i - 1) * ncell) + ii + 1; + const int jfine = ((j - 1) * ncell) + jj + 1; + const int kfine = ((k - 1) * ncell) + kk + 1; + + const int cellidx = 24 * 24 * (ncell * ncell * ii + ncell * kk + jj); + + const uint_fast32_t elementIndex = + ifine * (gc.wrapy - 1) * (gc.wrapz - 1) + + kfine * (gc.wrapy - 1) + jfine; + const MTYPE elementScale = + gc.Emin + x[elementIndex] * x[elementIndex] * + x[elementIndex] * (gc.E0 - gc.Emin); + for (int off = 0;off<3;off++){ + out_local += elementScale*KE[cellidx+imv*24+pos+off]*((MTYPE)in[edof+off]); + } + } + + } + } + } + #pragma omp atomic update + out[idx_update] += (CTYPE)out_local; + } + + + + // apply boundaryConditions +//#pragma omp parallel for schedule(static) default(none) shared(gc,in,out) +#pragma omp target teams distribute parallel for schedule(static) device(gpu_node->id)//default(none) shared(gc,in,out) + for (int i = 0; i < n; i++) { + out[idx[i]] = in[idx[i]]; + } +} + +void projectToFinerGrid_halo(const struct gpuNode * gpu_node, + /*in*/ const int l, /*in*/ + const CTYPE *ucoarse, /*in*/ + CTYPE *ufine /*out*/) { + const struct gridContext * gc = gpu_node->gc; + + // Computing dimensions for fine grid + int ncellf,wrapxf,wrapyf,wrapzf; + uint_fast32_t ndoff; + computePadding(gc,l,&ncellf,&wrapxf,&wrapyf,&wrapzf,&ndoff); + + // Computing dimensions for coarse grid + int ncellc,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(gc,l+1,&ncellc,&wrapxc,&wrapyc,&wrapzc,&ndofc); + + const int nelxf = gc->nelx / ncellf; + const int nelyf = gc->nely / ncellf; + const int nelzf = gc->nelz / ncellf; + + const int nxf = nelxf + 1; + const int nyf = nelyf + 1; + const int nzf = nelzf + 1; + + // loop over nodes, usually very large with nx*ny*nz = 100.000 or more + #pragma omp target teams distribute parallel for collapse(4) schedule(static) device(gpu_node->id) + for (int32_t ifine = 1; ifine < nxf + 1; ifine++) + for (int32_t kfine = 1; kfine < nzf + 1; kfine++) + for (int32_t jfine = 1; jfine < nyf + 1; jfine++) { + for(int offset = 0;offset<3;offset++){ + + const uint32_t fineIndex = + ifine * wrapyf * wrapzf + kfine * wrapyf + jfine; + + const uint32_t icoarse1 = (ifine - 1) / 2 + 1; + const uint32_t 
icoarse2 = (ifine) / 2 + 1; + const uint32_t jcoarse1 = (jfine - 1) / 2 + 1; + const uint32_t jcoarse2 = (jfine) / 2 + 1; + const uint32_t kcoarse1 = (kfine - 1) / 2 + 1; + const uint32_t kcoarse2 = (kfine) / 2 + 1; + + // Node indices on coarse grid + const uint_fast32_t coarseIndex1 = + icoarse1 * wrapyc * wrapzc + kcoarse1 * wrapyc + jcoarse2; + const uint_fast32_t coarseIndex2 = + icoarse2 * wrapyc * wrapzc + kcoarse1 * wrapyc + jcoarse2; + const uint_fast32_t coarseIndex3 = + icoarse2 * wrapyc * wrapzc + kcoarse1 * wrapyc + jcoarse1; + const uint_fast32_t coarseIndex4 = + icoarse1 * wrapyc * wrapzc + kcoarse1 * wrapyc + jcoarse1; + const uint_fast32_t coarseIndex5 = + icoarse1 * wrapyc * wrapzc + kcoarse2 * wrapyc + jcoarse2; + const uint_fast32_t coarseIndex6 = + icoarse2 * wrapyc * wrapzc + kcoarse2 * wrapyc + jcoarse2; + const uint_fast32_t coarseIndex7 = + icoarse2 * wrapyc * wrapzc + kcoarse2 * wrapyc + jcoarse1; + const uint_fast32_t coarseIndex8 = + icoarse1 * wrapyc * wrapzc + kcoarse2 * wrapyc + jcoarse1; + + ufine[3 * fineIndex + offset] = 0.125 * ucoarse[3 * coarseIndex1 + offset] + + 0.125 * ucoarse[3 * coarseIndex2 + offset] + + 0.125 * ucoarse[3 * coarseIndex3 + offset] + + 0.125 * ucoarse[3 * coarseIndex4 + offset] + + 0.125 * ucoarse[3 * coarseIndex5 + offset] + + 0.125 * ucoarse[3 * coarseIndex6 + offset] + + 0.125 * ucoarse[3 * coarseIndex7 + offset] + + 0.125 * ucoarse[3 * coarseIndex8 + offset]; + } + } +} + +// projects a field to a coarser grid ufine -> ucoarse +// temperature: medium, called (number of mg levels [1-5]) x (number of cg +// iterations [5-100]) = [5-500] times pr design iteration +void projectToCoarserGrid_halo(const struct gpuNode * gpu_node,//in + const int l,//in + const CTYPE *ufine, //in + CTYPE *ucoarse //out + ) { + const struct gridContext * gc = gpu_node->gc; + + // Computing dimensions for fine grid + int ncellf,wrapxf,wrapyf,wrapzf; + uint_fast32_t ndoff; + computePadding(gc,l,&ncellf,&wrapxf,&wrapyf,&wrapzf,&ndoff); + + // Computing dimensions for coarse grid + int ncellc,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(gc,l+1,&ncellc,&wrapxc,&wrapyc,&wrapzc,&ndofc); + + const int nelxc = gc->nelx / ncellc; + const int nelyc = gc->nely / ncellc; + const int nelzc = gc->nelz / ncellc; + + const int nxc = nelxc + 1; + const int nyc = nelyc + 1; + const int nzc = nelzc + 1; + + const MTYPE vals[4] = {1.0, 0.5, 0.25, 0.125}; + //const int n_threads = 8; + //const int n_teams = 5120/n_threads; + + // loop over nodes, usually very large with nx*ny*nz = 100.000 or more + //#pragma omp target data map(to:ufine[:ndofc_fine],vals[:4]) map(tofrom:ucoarse[:ndofc_coarse]) + #pragma omp target data map(to:vals[:4]) device(gpu_node->id) + { + //#pragma omp target teams num_teams(n_teams) distribute collapse(3) + #pragma omp target teams distribute parallel for schedule(static) collapse(3) device(gpu_node->id) + for (int32_t icoarse = 1; icoarse < nxc + 1; icoarse++) + for (int32_t kcoarse = 1; kcoarse < nzc + 1; kcoarse++) + for (int32_t jcoarse = 1; jcoarse < nyc + 1; jcoarse++) { + + const int coarseIndex = + icoarse * wrapyc * wrapzc + kcoarse * wrapyc + jcoarse; + + ucoarse[3 * coarseIndex] = 0; + ucoarse[3 * coarseIndex+1] = 0; + ucoarse[3 * coarseIndex+2] = 0; + } + #pragma omp target teams distribute parallel for schedule(static) collapse(3) device(gpu_node->id) + for (int32_t icoarse = 1; icoarse < nxc + 1; icoarse++) + for (int32_t kcoarse = 1; kcoarse < nzc + 1; kcoarse++) + for (int32_t jcoarse = 1; jcoarse < nyc + 1; 
jcoarse++) { + + const int coarseIndex = + icoarse * wrapyc * wrapzc + kcoarse * wrapyc + jcoarse; + + + // Node indices on fine grid + const int nx1 = (icoarse - 1) * 2 + 1; + const int ny1 = (jcoarse - 1) * 2 + 1; + const int nz1 = (kcoarse - 1) * 2 + 1; + + const int xmin = nx1 - 1; + const int ymin = ny1 - 1; + const int zmin = nz1 - 1; + + const int xmax = nx1 + 2; + const int ymax = ny1 + 2; + const int zmax = nz1 + 2; + + /*ucoarse[3 * coarseIndex] = 0; + ucoarse[3 * coarseIndex+1] = 0; + ucoarse[3 * coarseIndex+2] = 0;*/ + + // this can be done faster by writing out the 27 iterations by hand, + // do it when necessary. + //#pragma omp parallel for num_threads(n_threads) collapse(4) schedule(static) + for (int32_t ifine = xmin; ifine < xmax; ifine++) + for (int32_t kfine = zmin; kfine < zmax; kfine++) + for (int32_t jfine = ymin; jfine < ymax; jfine++) + for (int32_t offset = 0; offset < 3; offset++){ + + const uint32_t fineIndex = + ifine * wrapyf * wrapzf + kfine * wrapyf + jfine; + + const int ind = (nx1 - ifine) * (nx1 - ifine) + + (ny1 - jfine) * (ny1 - jfine) + + (nz1 - kfine) * (nz1 - kfine); + int idx = 3 * coarseIndex+offset; + CTYPE prod = vals[ind] * ufine[3 * fineIndex + offset]; + #pragma omp atomic update + ucoarse[idx] += prod; + } + } + } +} + +// generate the matrix diagonal for jacobi smoothing. +// temperature: low-medium, called number of levels for every design +// iteration. +void assembleInvertedMatrixDiagonalSubspace_halo(const struct gpuNode * gpu_node, + const DTYPE *x, const int l, + MTYPE *diag) { + const struct gridContext * gc = gpu_node->gc; + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc); + const int32_t nelxc = gc->nelx / ncell; + const int32_t nelyc = gc->nely / ncell; + const int32_t nelzc = gc->nelz / ncell; + const int wrapy = gc->wrapy; + const int wrapz = gc->wrapz; + + uint_fast32_t* idx = gc->fixedDofs[l].idx; + uint_fast32_t n = gc->fixedDofs[l].n; + const double Emin = gc->Emin; + const double E0_Emin = gc->E0-gc->Emin; + const MTYPE * KE = gc->precomputedKE[l]; + + //#pragma omp target data map(to:idx[:n]) + //{ + #pragma omp target teams distribute parallel for schedule(static) device(gpu_node->id) + for (unsigned int i = 0; i < ndofc; i++) + diag[i] = 0.0; + + #pragma omp target teams distribute parallel for collapse(9) schedule(static) device(gpu_node->id) + for (int32_t bx = 0; bx < 2; bx++) + for (int32_t bz = 0; bz < 2; bz++) + for (int32_t by = 0; by < 2; by++) + for (int32_t it = 1; it < nelxc + 1; it += 2) + for (int32_t kt = 1; kt < nelzc + 1; kt += 2) + for (int32_t jt = 1; jt < nelyc + 1; jt += 2) + for (int ii = 0; ii < ncell; ii++) + for (int kk = 0; kk < ncell; kk++) + for (int jj = 0; jj < ncell; jj++) { + int32_t i = bx + it; + int32_t k = bz + kt; + int32_t j = by + jt; + if ((i >= nelxc + 1) || (k >= nelzc + 1) || (j >= nelyc + 1)){ + break; + } + uint_fast32_t edof[24]; + getEdof_halo(edof, i, j, k, wrapyc, wrapzc); + const int ifine = ((i - 1) * ncell) + ii + 1; + const int jfine = ((j - 1) * ncell) + jj + 1; + const int kfine = ((k - 1) * ncell) + kk + 1; + + const int cellidx = ncell * ncell * ii + ncell * kk + jj; + + const uint_fast32_t elementIndex = + ifine * (wrapy - 1) * (wrapz - 1) + + kfine * (wrapy - 1) + jfine; + const MTYPE elementScale = + Emin + x[elementIndex] * x[elementIndex] * + x[elementIndex] * E0_Emin; + + for (int iii = 0; iii < 24; iii++){ + #pragma omp atomic update + diag[edof[iii]] += elementScale * KE[24 * 24 * cellidx + 
iii * 24 + iii]; + } + } +//printf("(wrapxc,wrapyc,wrapzc)=(%d,%d,%d)\n",wrapxc,wrapyc,wrapzc); + +// apply boundaryConditions +#pragma omp target teams distribute parallel for schedule(static) device(gpu_node->id) + for (int i = 0; i < n; i++){ + diag[idx[i]] = 1.0; + } + +#pragma omp target teams distribute parallel for collapse(3) schedule(static) device(gpu_node->id) + for (int i = 1; i < nelxc + 2; i++) + for (int k = 1; k < nelzc + 2; k++) + for (int j = 1; j < nelyc + 2; j++) { + const int nidx = (i * wrapyc * wrapzc + wrapyc * k + j); + + diag[3 * nidx + 0] = 1.0 / diag[3 * nidx + 0]; + diag[3 * nidx + 1] = 1.0 / diag[3 * nidx + 1]; + diag[3 * nidx + 2] = 1.0 / diag[3 * nidx + 2]; + } + //} +} + +__force_inline inline void getEdof_halo_8(uint_fast32_t edof[8], const int i, + const int j, const int k, + const int wrapy, const int wrapz) { + + const int nx_1 = i; + const int nx_2 = i + 1; + const int nz_1 = k; + const int nz_2 = k + 1; + const int ny_1 = j; + const int ny_2 = j + 1; + + const uint_fast32_t nIndex1 = nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_2; + const uint_fast32_t nIndex2 = nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_2; + const uint_fast32_t nIndex3 = nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_1; + const uint_fast32_t nIndex4 = nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_1; + const uint_fast32_t nIndex5 = nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_2; + const uint_fast32_t nIndex6 = nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_2; + const uint_fast32_t nIndex7 = nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_1; + const uint_fast32_t nIndex8 = nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_1; + + edof[0] = 3 * nIndex1; + edof[1] = 3 * nIndex2; + edof[2] = 3 * nIndex3; + edof[3] = 3 * nIndex4; + + edof[4] = 3 * nIndex5; + edof[5] = 3 * nIndex6; + edof[6] = 3 * nIndex7; + edof[7] = 3 * nIndex8; +} + +// generate elementwise gradients from displacement. +// temperature: cold, called once for every design iteration. +void getComplianceAndSensetivity_halo(const struct gpuNode * gpu_node, + const DTYPE *x, STYPE *u, DTYPE *c, + DTYPE *dcdx) { + const struct gridContext gc = *(gpu_node->gc); + + c[0] = 0.0; + DTYPE cc = 0.0; + + #pragma omp target teams distribute parallel for schedule(static) collapse(3) device(gpu_node->id) + for (int32_t i = 1; i < gc.nelx + 1; i++) + for (int32_t k = 1; k < gc.nelz + 1; k++) + for (int32_t j = 1; j < gc.nely + 1; j++) { + const uint_fast32_t elementIndex = + i * (gc.wrapy - 1) * (gc.wrapz - 1) + k * (gc.wrapy - 1) + j; + dcdx[elementIndex] = 0.0; + } + +// loops over all elements, typically 100.000 or more. Note that there are no +// write dependencies, other than the reduction. 
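+  // The element product u_e^T * KE * u_e is split into eight passes, one
+  // target kernel per local node ii: each pass evaluates the three KE rows of
+  // that node, adds the partial compliance to cc through the OpenMP reduction,
+  // and accumulates the same clocal term into dcdx with the SIMP derivative
+  // -3 * x^2 * (E0 - Emin).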
+ MTYPE * KE = gc.precomputedKE[0]; + double E_diff = gc.E0 - gc.Emin; + double Emin = gc.Emin; + //#pragma omp target data map(always,tofrom:cc) + { + for (int ii=0;ii<8;ii++) + #pragma omp target teams distribute parallel for schedule(static) collapse(3) map(always,tofrom:cc) reduction(+ : cc) firstprivate(Emin,E_diff) device(gpu_node->id) + for (int32_t i = 1; i < gc.nelx + 1; i++) + for (int32_t k = 1; k < gc.nelz + 1; k++) + for (int32_t j = 1; j < gc.nely + 1; j++) { + uint_fast32_t edof[8]; + + getEdof_halo_8(edof, i, j, k, gc.wrapy, gc.wrapz); + const uint_fast32_t elementIndex = + i * (gc.wrapy - 1) * (gc.wrapz - 1) + k * (gc.wrapy - 1) + j; + + // clocal = ulocal^T * ke * ulocal + STYPE clocal = 0.0; + for(int iii=0;iii<3;iii++){ + MTYPE tmp = 0.0; + for (int jj=0;jj<8;jj++){ + tmp += KE[(ii*3+iii)*24+3*jj]*u[edof[jj]]; + tmp += KE[(ii*3+iii)*24+3*jj+1]*u[edof[jj]+1]; + tmp += KE[(ii*3+iii)*24+3*jj+2]*u[edof[jj]+2]; + } + clocal += u[edof[ii]+iii] * tmp; + } + // apply contribution to c and dcdx + cc += clocal * (Emin + x[elementIndex] * x[elementIndex] * + x[elementIndex] * E_diff); + dcdx[elementIndex] += clocal * (-3.0 * E_diff * + x[elementIndex] * x[elementIndex]); + } + } + c[0] = cc; +} diff --git a/stencil_methods.h b/stencil_methods.h new file mode 100644 index 0000000..6239b3a --- /dev/null +++ b/stencil_methods.h @@ -0,0 +1,183 @@ +#pragma once + +#include "definitions.h" +#include "gpu_definitions.h" + +void applyStateOperator_stencil(const struct gpuNode * gpu_node, const DTYPE *x, + const CTYPE *in, CTYPE *out); + +void applyStateOperatorSubspace_halo(const struct gpuNode * gpu_node, const int l, + const DTYPE *x, CTYPE *in, CTYPE *out); + +void getComplianceAndSensetivity_halo(const struct gpuNode * gpu_node, + const DTYPE *x, STYPE *u, DTYPE *c, + DTYPE *dcdx); + +void projectToFinerGrid_halo(const struct gpuNode * gpu_node, + /*in*/ const int l, /*in*/ + const CTYPE *ucoarse, /*in*/ + CTYPE *ufine /*out*/); + +void projectToCoarserGrid_halo(const struct gpuNode * gpu_node, + /*in*/ const int l, /*in*/ + const CTYPE *ufine, /*in*/ + CTYPE *ucoarse /*out*/); + +void assembleInvertedMatrixDiagonalSubspace_halo(const struct gpuNode * gpu_node, + const DTYPE *x, const int l, + MTYPE *diag); + +#pragma omp declare target +__force_inline int inline getLocalNodeIndex( + const int nodeOffsetFromElement[3]) { + int output = -1; + + const int ox = nodeOffsetFromElement[0]; + const int oy = nodeOffsetFromElement[1]; + const int oz = nodeOffsetFromElement[2]; + + // hack by jumptable, inputs are compile-time constant, so it should be fine. + // I do miss templates for this stuff though.. 
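+  // The mapping below mirrors the node ordering used by getEdof_halo_8:
+  // offset (ox, oy, oz) from the element origin -> local node number
+  //   (0,1,0)->0  (1,1,0)->1  (1,0,0)->2  (0,0,0)->3
+  //   (0,1,1)->4  (1,1,1)->5  (1,0,1)->6  (0,0,1)->7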
+ + if (ox == 1 && oy == 1 && oz == 1) + output = 5; + else if (ox == 1 && oy == 0 && oz == 1) + output = 6; + else if (ox == 0 && oy == 1 && oz == 1) + output = 4; + else if (ox == 0 && oy == 0 && oz == 1) + output = 7; + else if (ox == 1 && oy == 1 && oz == 0) + output = 1; + else if (ox == 1 && oy == 0 && oz == 0) + output = 2; + else if (ox == 0 && oy == 1 && oz == 0) + output = 0; + else if (ox == 0 && oy == 0 && oz == 0) + output = 3; + + return output; +} + +__force_inline inline void applyStateStencilSpoke_finegrid( + const struct gridContext gc, const MTYPE *precomputedKE, const int i_center, + const int j_center, const int k_center, const int nodeOffset[3], + const int elementOffset[3], const uint32_t ny, const uint32_t nz, const DTYPE *x, + MTYPE inBuffer_x[STENCIL_SIZE_Y], MTYPE inBuffer_y[STENCIL_SIZE_Y], + MTYPE inBuffer_z[STENCIL_SIZE_Y], MTYPE outBuffer_x[STENCIL_SIZE_Y], + MTYPE outBuffer_y[STENCIL_SIZE_Y], MTYPE outBuffer_z[STENCIL_SIZE_Y]) { + + // compute sending and recieving local node number, hopefully be evaluated at + // compile-time + + const int recievingNodeOffset[3] = {-elementOffset[0], -elementOffset[1], + -elementOffset[2]}; + const int nodeOffsetFromElement[3] = {nodeOffset[0] - elementOffset[0], + nodeOffset[1] - elementOffset[1], + nodeOffset[2] - elementOffset[2]}; + + const int localRecievingNodeIdx = getLocalNodeIndex(recievingNodeOffset); + const int localSendingNodeIdx = getLocalNodeIndex(nodeOffsetFromElement); + + // compute index for element + const int i_element = i_center + elementOffset[0]; + const int j_element = j_center + elementOffset[1]; + const int k_element = k_center + elementOffset[2]; + + MTYPE localBuf_x[STENCIL_SIZE_Y]; + MTYPE localBuf_y[STENCIL_SIZE_Y]; + MTYPE localBuf_z[STENCIL_SIZE_Y]; + + const int startRecieve_local = 3 * localRecievingNodeIdx; + const int startSend_local = 3 * localSendingNodeIdx; + + // loop over simd stencil size +// currently does not compile to simd instructions.. +#pragma omp simd safelen(STENCIL_SIZE_Y) simdlen(STENCIL_SIZE_Y) aligned( \ + inBuffer_x, inBuffer_y, inBuffer_z, outBuffer_x, outBuffer_y, outBuffer_z \ + : __alignBound) + for (int jj = 0; jj < STENCIL_SIZE_Y; jj++) { + + // local coordinates + const uint_fast32_t elementIndex = + (i_element) * (gc.wrapy - 1) * (gc.wrapz - 1) + + (k_element) * (gc.wrapy - 1) + (j_element + jj); + MTYPE elementScale = gc.Emin + x[elementIndex] * x[elementIndex] * + x[elementIndex] * (gc.E0 - gc.Emin); + + // important, sets true zero to halo values. This is necessary for + // correctness. Performance can be gained by removing the constant Emin, and + // setting the minimum allowed density to a corresponding non-zero value. + // But this is left for the future at the moment. 
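+    // elementScale above is the modified SIMP interpolation
+    //   E(x) = Emin + x^3 * (E0 - Emin)
+    // with the exponent 3 hard-coded (gc.penal is not read here); the test
+    // below then removes the stiffness of halo/padding elements that lie
+    // outside the physical domain.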
+ if (i_element == 0 || i_element > gc.nelx || j_element + jj == 0 || + j_element + jj > gc.nely || k_element == 0 || k_element > gc.nelz) + elementScale = 0.0; + + localBuf_x[jj] = 0.0; + localBuf_y[jj] = 0.0; + localBuf_z[jj] = 0.0; + + // add the spoke contribution + localBuf_x[jj] += + precomputedKE[24 * (startRecieve_local + 0) + (startSend_local + 0)] * + inBuffer_x[jj]; + localBuf_x[jj] += + precomputedKE[24 * (startRecieve_local + 0) + (startSend_local + 1)] * + inBuffer_y[jj]; + localBuf_x[jj] += + precomputedKE[24 * (startRecieve_local + 0) + (startSend_local + 2)] * + inBuffer_z[jj]; + + localBuf_y[jj] += + precomputedKE[24 * (startRecieve_local + 1) + (startSend_local + 0)] * + inBuffer_x[jj]; + localBuf_y[jj] += + precomputedKE[24 * (startRecieve_local + 1) + (startSend_local + 1)] * + inBuffer_y[jj]; + localBuf_y[jj] += + precomputedKE[24 * (startRecieve_local + 1) + (startSend_local + 2)] * + inBuffer_z[jj]; + + localBuf_z[jj] += + precomputedKE[24 * (startRecieve_local + 2) + (startSend_local + 0)] * + inBuffer_x[jj]; + localBuf_z[jj] += + precomputedKE[24 * (startRecieve_local + 2) + (startSend_local + 1)] * + inBuffer_y[jj]; + localBuf_z[jj] += + precomputedKE[24 * (startRecieve_local + 2) + (startSend_local + 2)] * + inBuffer_z[jj]; + + outBuffer_x[jj] += elementScale * localBuf_x[jj]; + outBuffer_y[jj] += elementScale * localBuf_y[jj]; + outBuffer_z[jj] += elementScale * localBuf_z[jj]; + } +} + +__force_inline inline void +loadStencilInput(const struct gridContext gc, const int i_center, + const int j_center, const int k_center, + const int nodeOffset[3], const CTYPE *in, + MTYPE buffer_x[STENCIL_SIZE_Y], MTYPE buffer_y[STENCIL_SIZE_Y], + MTYPE buffer_z[STENCIL_SIZE_Y]) { + + const int i_sender = i_center + nodeOffset[0]; + const int j_sender = j_center + nodeOffset[1]; + const int k_sender = k_center + nodeOffset[2]; + +#pragma omp simd safelen(STENCIL_SIZE_Y) simdlen(STENCIL_SIZE_Y) \ + aligned(buffer_x, buffer_y, buffer_z \ + : __alignBound) + for (int jj = 0; jj < STENCIL_SIZE_Y; jj++) { + + const uint_fast32_t sendingNodeIndex = + (i_sender)*gc.wrapy * gc.wrapz + (k_sender)*gc.wrapy + (j_sender + jj); + + const int startSend = 3 * sendingNodeIndex; + + buffer_x[jj] = in[startSend + 0]; + buffer_y[jj] = in[startSend + 1]; + buffer_z[jj] = in[startSend + 2]; + } +} +#pragma omp end declare target diff --git a/stencil_optimization.c b/stencil_optimization.c new file mode 100644 index 0000000..0d5cf4a --- /dev/null +++ b/stencil_optimization.c @@ -0,0 +1,598 @@ +#include "stencil_optimization.h" + +#include "stencil_grid_utility.h" +#include "stencil_methods.h" +#include "stencil_solvers.h" +#include "gpu_definitions.h" + +#ifdef _OPENMP +#include +#endif + +// main function +void top3dmgcg(const uint_fast32_t nelx, const uint_fast32_t nely, + const uint_fast32_t nelz, const DTYPE volfrac, const DTYPE rmin, + const uint_fast32_t nl, const float cgtol, + const uint_fast32_t cgmax, const int verbose,const int write_result,const int max_iterations) { + + struct gridContext gridContext; + setupGC(&gridContext, nl,nelx,nely,nelz); + + struct gpuGrid gpu_gc; + const int num_targets = 1; //omp_get_num_devices(); + if (verbose == 1){ + printf("OpenMP enabled with %d devices.\n",num_targets); + printf("OpenMP default device: %d\n",omp_get_default_device()); + } + gpu_gc.num_targets = num_targets; + + gridContextToGPUGrid(&gridContext,&gpu_gc,nl,verbose); + + const uint_fast64_t nelem = (gridContext.wrapx - 1) * + (gridContext.wrapy - 1) * (gridContext.wrapz - 1); + + CTYPE *F; 
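+  // F is the global load vector and U the displacement field, both stored in
+  // the zero-padded halo layout (three dofs per node of the wrapx*wrapy*wrapz
+  // grid). The block below fills F with the cantilever load case: a total
+  // force of forceMagnitude in the z-direction, distributed over the element
+  // faces in the k = 0 plane that lie inside a circle of radius (nely+1)/5
+  // centred at (nelx, (nely-1)/2), each element adding nodalForce to its four
+  // face nodes.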
+ STYPE *U; + allocateZeroPaddedStateField(gridContext, 0, &F); + allocateZeroPaddedStateField_STYPE(gridContext, 0, &U); + + double forceMagnitude = -1; + + { // setup cantilever load + const int ny = nely + 1; + + const int k = 0; + + const double radius = ((double)ny) / 5.0; // snap + const double radius2 = radius * radius; + const double center_x = (double)nelx; + const double center_y = ((double)nely - 1.0) / 2.0; + + int num_elements = 0; + for (int i = 0; i < nelx; i++) { + for (int j = 0; j < nely; j++) { + const double dx = (double)i - center_x; + const double dy = (double)j - center_y; + const double dist2 = dx * dx + dy * dy; + if (dist2 < radius2) { + num_elements++; + } + } + } + + double nodalForce = forceMagnitude / (4.0 * (double)num_elements); + for (int i = 0; i < nelx; i++) { + for (int j = 0; j < nely; j++) { + + const int ii = i + 1; + const int jj = j + 1; + const int kk = k + 1; + + const double dx = (double)i - center_x; + const double dy = (double)j - center_y; + const double dist2 = dx * dx + dy * dy; + + if (dist2 < radius2) { + const uint_fast32_t nidx1 = + (ii + 1) * gridContext.wrapy * gridContext.wrapz + + gridContext.wrapy * kk + (jj + 1); + const uint_fast32_t nidx2 = + (ii + 1) * gridContext.wrapy * gridContext.wrapz + + gridContext.wrapy * kk + jj; + const uint_fast32_t nidx3 = + ii * gridContext.wrapy * gridContext.wrapz + + gridContext.wrapy * kk + (jj + 1); + const uint_fast32_t nidx4 = + ii * gridContext.wrapy * gridContext.wrapz + + gridContext.wrapy * kk + jj; + F[3 * nidx1 + 2] += nodalForce; + F[3 * nidx2 + 2] += nodalForce; + F[3 * nidx3 + 2] += nodalForce; + F[3 * nidx4 + 2] += nodalForce; + } + } + } + } + + DTYPE *dc = malloc(sizeof(DTYPE) * nelem); + DTYPE *dv = malloc(sizeof(DTYPE) * nelem); + DTYPE *x = malloc(sizeof(DTYPE) * nelem); + DTYPE *xPhys = malloc(sizeof(DTYPE) * nelem); + DTYPE *xnew = malloc(sizeof(DTYPE) * nelem); + DTYPE c = 0.0; + + #pragma omp parallel for schedule(static) default(none) firstprivate(nelem) shared(x,xPhys,dv) + for (uint_fast64_t i = 0; i < nelem; i++) { + x[i] = 0.0; + xPhys[i] = 0.0; + dv[i] = 1.0; + } + + #pragma omp parallel for collapse(3) schedule(static) default(none) firstprivate(volfrac) shared(gridContext,x,xPhys) + for (int i = 1; i < gridContext.nelx + 1; i++) + for (int k = 1; k < gridContext.nelz + 1; k++) + for (int j = 1; j < gridContext.nely + 1; j++) { + const int idx = i * (gridContext.wrapy - 1) * (gridContext.wrapz - 1) + + k * (gridContext.wrapy - 1) + j; + + x[idx] = volfrac; + xPhys[idx] = volfrac; + } + + // allocate needed memory for solver + struct SolverDataBuffer solverData; + allocateSolverData(gridContext, nl, &solverData); + initializeCholmod(gridContext, nl - 1, &solverData.bottomSolver, + solverData.coarseMatrices[nl - 1]); + + #pragma omp target enter data map(to:dv[:nelem]) device(gpu_gc.target[0].id) + applyDensityFilterGradient(gridContext, rmin, dv); + + unsigned int loop = 0; + float change = 1; + + #ifdef _OPENMP + if (verbose ==1){ + printf(" OpenMP enabled with %d threads\n", omp_get_max_threads()); + } + + const double start_wtime = omp_get_wtime(); + #endif + + // Mapping data to the GPU grid + enter_data(gridContext,solverData,gpu_gc,xPhys,x,xnew,U,F,dc,nelem,nl); + + /* %% START ITERATION */ + DTYPE vol = 0.0; + int gciter_total = 0; + while ((change > 1e-2) && (loop < max_iterations)) { + + loop++; + + int cgiter; + float cgres; + const int nswp = 4; + solveStateMG_halo(&gpu_gc, xPhys, nswp, nl, cgtol, &solverData, &cgiter, + &cgres, F, U); + //printf("Exit 
solveStateMG_halo\n"); + vol = compute_volume(gridContext,xPhys); + + getComplianceAndSensetivity_halo(&(gpu_gc.target[0]), xPhys, U, &c, dc); + //printf("Exit getComplianceAndSensetivity_halo\n"); + applyDensityFilterGradient(gridContext, rmin, dc); + + //#pragma omp taskwait + vol /= (DTYPE)(gridContext.nelx * gridContext.nely * gridContext.nelz); + DTYPE g = vol - volfrac; + + // Iteratively stepping solution forward + update_solution(x,xnew,dv,dc,nelem,g); + + // Computing the maximum change over all elements + change = compute_change(x,xnew,nelem); + + #pragma omp target update from(x[:nelem]) + applyDensityFilter(gridContext, rmin, x, xPhys); + #pragma omp target update to(xPhys[:nelem]) + + gciter_total += cgiter; + + if (verbose ==1){ + printf("It.:%4i Obj.:%6.3e Vol.:%6.3f ch.:%4.2f relres: %4.2e iters: %4i ", + loop, c, vol, change, cgres, cgiter); + #ifdef _OPENMP + printf("time: %6.3f \n", omp_get_wtime() - start_wtime); + #else + printf(" \n"); + #endif + } + } + + //#pragma omp target update from(xPhys[:nelem]) + + printf("%4i %12.6f %6.3f %4.2f %9i %9i",loop, c, vol, change,nelx*nely*nelz,gciter_total); + #ifdef _OPENMP + printf(" %6.3f \n", omp_get_wtime() - start_wtime); + #else + printf("\n"); + #endif + if (verbose ==1){ + #ifdef _OPENMP + printf("End time: %6.3f \n", omp_get_wtime() - start_wtime); + #endif + } + char name1[60]; + char name2[60]; + if (write_result == 1){ + snprintf(name1, 60, "out_%d_%d_%d.vtu",(int)nelx, (int)nely, (int)nelz); + writeDensityVtkFile(nelx, nely, nelz, xPhys,name1); + snprintf(name2, 60, "out_%d_%d_%d_halo.vtu",(int)nelx, (int)nely, (int)nelz); + writeDensityVtkFileWithHalo(nelx, nely, nelz, xPhys,name2); + } + finishCholmod(gridContext, nl - 1, &solverData.bottomSolver, + solverData.coarseMatrices[nl - 1],verbose); + freeSolverData(&solverData, nl); + freeGC(&gridContext, nl); + freeGPUGrid(&gpu_gc,nl); + + free(dc); + free(dv); + free(x); + free(xPhys); + free(xnew); +} + +// this function acts as a matrix-free replacement for out = (H*rho(:))./Hs +// note that rho and out cannot be the same pointer! +// temperature: cold, called once pr design iteration +void applyDensityFilter(const struct gridContext gc, const DTYPE rmin, + const DTYPE *rho, DTYPE *out) { + + const uint32_t nelx = gc.nelx; + const uint32_t nely = gc.nely; + const uint32_t nelz = gc.nelz; + + const uint32_t elWrapy = gc.wrapy - 1; + const uint32_t elWrapz = gc.wrapz - 1; + + //#pragma omp target teams distribute parallel for firstprivate(rmin,elWrapy,elWrapz) + // It was found that this function was hard to parallelize in an efficient way on the GPU + +// loop over elements, usually very large with nelx*nely*nelz = 100.000 or +// more + #pragma omp parallel for collapse(3) default(none) firstprivate(nelx,nely,nelz,rmin,elWrapy,elWrapz) shared(out,rho) + for (unsigned int i1 = 1; i1 < nelx + 1; i1++) + for (unsigned int k1 = 1; k1 < nelz + 1; k1++) + for (unsigned int j1 = 1; j1 < nely + 1; j1++) { + + const uint32_t e1 = i1 * elWrapy * elWrapz + k1 * elWrapy + j1; + + double oute1 = 0.0; + double unityScale = 0.0; + + // loop over neighbourhood + const uint32_t i2max = MIN(i1 + (ceil(rmin) + 1), nelx + 1); + const uint32_t i2min = MAX(i1 - (ceil(rmin) - 1), 1); + + // the three loops herein are over a constant neighbourhood. 
typically + // 4x4x4 or something like that + for (uint32_t i2 = i2min; i2 < i2max; i2++) { + + const uint32_t k2max = MIN(k1 + (ceil(rmin) + 1), nelz + 1); + const uint32_t k2min = MAX(k1 - (ceil(rmin) - 1), 1); + + for (uint32_t k2 = k2min; k2 < k2max; k2++) { + + const uint32_t j2max = MIN(j1 + (ceil(rmin) + 1), nely + 1); + const uint32_t j2min = MAX(j1 - (ceil(rmin) - 1), 1); + + for (uint32_t j2 = j2min; j2 < j2max; j2++) { + + const uint32_t e2 = i2 * elWrapy * elWrapz + k2 * elWrapy + j2; + + const double filterWeight = + MAX(0.0, rmin - sqrt((i1 - i2) * (i1 - i2) + + (j1 - j2) * (j1 - j2) + + (k1 - k2) * (k1 - k2))); + + oute1 += filterWeight * rho[e2]; + unityScale += filterWeight; + } + } + } + out[e1] = oute1 / unityScale; + } +} + +// this function acts as a matrix-free replacement for v = H* (v(:)./Hs) +// note that rho and out cannot be the same pointer! +// temperature: cold, called twice pr design iteration +void applyDensityFilterGradient(const struct gridContext gc, const DTYPE rmin, + DTYPE *v) { + const uint32_t nelx = gc.nelx; + const uint32_t nely = gc.nely; + const uint32_t nelz = gc.nelz; + const uint32_t elWrapy = gc.wrapy - 1; + const uint32_t elWrapz = gc.wrapz - 1; + const uint32_t N = (gc.wrapx - 1) * elWrapy * elWrapz; + DTYPE *tmp = malloc(sizeof(DTYPE) * N); + +// loop over elements, usually very large with nelx*nely*nelz = 100.000 or +// more + const uint_fast64_t nelem = (gc.wrapx - 1) * + (gc.wrapy - 1) * (gc.wrapz - 1); + const double ceil_rmin = ceil(rmin); + #pragma omp target data map(to:tmp[:N]) //map(always,tofrom:v[:nelem]) + { + #pragma omp target teams distribute parallel for + for(int i=0;i\n"); + fprintf(fid, "\n"); + fprintf(fid, "\n", + numberOfNodes, numberOfElements); + + // points + fprintf(fid, "\n"); + fprintf(fid, + "\n", + 3); + for (int i = 0; i < nx; i++) + for (int k = 0; k < nz; k++) + for (int j = 0; j < ny; j++) + fprintf(fid, "%e %e %e\n", (float)i, (float)j, (float)k); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + + fprintf( + fid, + "\n"); + for (int i = 0; i < nelx; i++) + for (int k = 0; k < nelz; k++) + for (int j = 0; j < nely; j++) { + const int nx_1 = i; + const int nx_2 = i + 1; + const int nz_1 = k; + const int nz_2 = k + 1; + const int ny_1 = j; + const int ny_2 = j + 1; + fprintf(fid, "%d %d %d %d %d %d %d %d\n", + nx_1 * ny * nz + nz_1 * ny + ny_2, + nx_2 * ny * nz + nz_1 * ny + ny_2, + nx_2 * ny * nz + nz_1 * ny + ny_1, + nx_1 * ny * nz + nz_1 * ny + ny_1, + nx_1 * ny * nz + nz_2 * ny + ny_2, + nx_2 * ny * nz + nz_2 * ny + ny_2, + nx_2 * ny * nz + nz_2 * ny + ny_1, + nx_1 * ny * nz + nz_2 * ny + ny_1); + } + + fprintf(fid, "\n"); + + fprintf(fid, + "\n"); + for (int i = 1; i < numberOfElements + 1; i++) + fprintf(fid, "%d\n", i * 8); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + for (int i = 0; i < numberOfElements; i++) + fprintf(fid, "%d\n", 12); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + fprintf(fid, "\n"); + for (unsigned int i1 = 1; i1 < nelx + 1; i1++) + for (unsigned int k1 = 1; k1 < nelz + 1; k1++) + for (unsigned int j1 = 1; j1 < nely + 1; j1++) { + const uint64_t idx = i1 * elWrapy * elWrapz + k1 * elWrapy + j1; + fprintf(fid, "%e\n", densityArray[idx]); + } + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fclose(fid); +} + +// writes a file with a snapshot of the density field (x,xPhys), can be opened +// with paraview temperature: very cold, usually called once only +void 
writeDensityVtkFileWithHalo(const int nelx, const int nely, const int nelz, + const DTYPE *densityArray, + const char *filename) { + + const int paddingx = + (STENCIL_SIZE_X - ((nelx + 1) % STENCIL_SIZE_X)) % STENCIL_SIZE_X; + const int paddingy = + (STENCIL_SIZE_Y - ((nely + 1) % STENCIL_SIZE_Y)) % STENCIL_SIZE_Y; + const int paddingz = + (STENCIL_SIZE_Z - ((nelz + 1) % STENCIL_SIZE_Z)) % STENCIL_SIZE_Z; + + const int wrapx = nelx + paddingx + 3; + const int wrapy = nely + paddingy + 3; + const int wrapz = nelz + paddingz + 3; + + const int elWrapx = wrapx - 1; + const int elWrapy = wrapy - 1; + const int elWrapz = wrapz - 1; + + int numberOfNodes = wrapx * wrapy * wrapz; + int numberOfElements = elWrapx * elWrapy * elWrapz; + + FILE *fid = fopen(filename, "w"); + + // write header + fprintf(fid, "\n"); + fprintf(fid, "\n"); + fprintf(fid, "\n", + numberOfNodes, numberOfElements); + + // points + fprintf(fid, "\n"); + fprintf(fid, + "\n", + 3); + for (int i = 0; i < wrapx; i++) + for (int k = 0; k < wrapz; k++) + for (int j = 0; j < wrapy; j++) + fprintf(fid, "%e %e %e\n", (float)i, (float)j, (float)k); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + + fprintf( + fid, + "\n"); + for (int i = 0; i < elWrapx; i++) + for (int k = 0; k < elWrapz; k++) + for (int j = 0; j < elWrapy; j++) { + const int nx_1 = i; + const int nx_2 = i + 1; + const int nz_1 = k; + const int nz_2 = k + 1; + const int ny_1 = j; + const int ny_2 = j + 1; + fprintf(fid, "%d %d %d %d %d %d %d %d\n", + nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_2, + nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_2, + nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_1, + nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_1, + nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_2, + nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_2, + nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_1, + nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_1); + } + + fprintf(fid, "\n"); + + fprintf(fid, + "\n"); + for (int i = 1; i < numberOfElements + 1; i++) + fprintf(fid, "%d\n", i * 8); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + for (int i = 0; i < numberOfElements; i++) + fprintf(fid, "%d\n", 12); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + fprintf(fid, "\n"); + for (unsigned int i1 = 0; i1 < elWrapx; i1++) + for (unsigned int k1 = 0; k1 < elWrapz; k1++) + for (unsigned int j1 = 0; j1 < elWrapy; j1++) { + const uint64_t idx = i1 * elWrapy * elWrapz + k1 * elWrapy + j1; + fprintf(fid, "%e\n", densityArray[idx]); + } + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fprintf(fid, "\n"); + fprintf(fid, "\n"); + fprintf(fid, "\n"); + + fclose(fid); +} diff --git a/stencil_optimization.h b/stencil_optimization.h new file mode 100644 index 0000000..e67e4c7 --- /dev/null +++ b/stencil_optimization.h @@ -0,0 +1,22 @@ +#pragma once + +#include "definitions.h" + +void top3dmgcg(const uint_fast32_t nelx, const uint_fast32_t nely, + const uint_fast32_t nelz, const DTYPE volfrac, const DTYPE rmin, + const uint_fast32_t nl, const float cgtol, + const uint_fast32_t cgmax, const int verbose, + const int write_result,const int max_iterations); + +void applyDensityFilter(const struct gridContext gc, const DTYPE rmin, + const DTYPE *rho, DTYPE *out); + +void applyDensityFilterGradient(const struct gridContext gc, const DTYPE rmin, + DTYPE *v); + +void writeDensityVtkFile(const int nelx, const int nely, const int nelz, + const DTYPE *densityArray, const char *filename); + +void writeDensityVtkFileWithHalo(const int nelx, const int nely, const int nelz, + const DTYPE *densityArray, + 
const char *filename); diff --git a/stencil_solvers.c b/stencil_solvers.c new file mode 100644 index 0000000..a28f1cc --- /dev/null +++ b/stencil_solvers.c @@ -0,0 +1,457 @@ +#include "stencil_solvers.h" + +#include "stencil_grid_utility.h" +#include "stencil_methods.h" + +// jacobi smoothing/preconditioning +// temperature: hot, called 2x(number of levels)x(number of cg iterations) ~ +// [20-1000] times every design iteration. Note that most compute time is spent +// in child function. + +void smoothDampedJacobiSubspace_halo(const struct gpuNode * gpu_node, + const DTYPE *x, const int l, + const uint_fast32_t nswp, + const CTYPE omega, const MTYPE *invD, + CTYPE *u, const CTYPE *b, CTYPE *tmp) { + + const struct gridContext * gc = gpu_node->gc; + + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndof; + computePadding(gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndof); + + const int32_t nelxc = gc->nelx / ncell; + const int32_t nelyc = gc->nely / ncell; + const int32_t nelzc = gc->nelz / ncell; + + // usually nswp is between 1 and 5 + for (int s = 0; s < nswp; s++) { + applyStateOperatorSubspace_halo(gpu_node, l, x, u, tmp); + + + // long for loop, as ndof is typically 300.000 or more, but also trivially + // parallel. + #pragma omp target teams distribute parallel for collapse(3) schedule(static) device(gpu_node->id) + for (int i = 1; i < nelxc + 2; i++) + for (int k = 1; k < nelzc + 2; k++) + for (int j = 1; j < nelyc + 2; j++) { + const int nidx = (i * wrapyc * wrapzc + wrapyc * k + j); + + const uint32_t idx1 = 3 * nidx + 0; + const uint32_t idx2 = 3 * nidx + 1; + const uint32_t idx3 = 3 * nidx + 2; + + u[idx1] += omega * invD[idx1] * (b[idx1] - tmp[idx1]); + u[idx2] += omega * invD[idx2] * (b[idx2] - tmp[idx2]); + u[idx3] += omega * invD[idx3] * (b[idx3] - tmp[idx3]); + } + } +} + +// jacobi smoothing/preconditioning +// temperature: hot, called 2x(number of levels)x(number of cg iterations) ~ +// [20-1000] times every design iteration. Note that most compute time is spent +// in child function. +void smoothDampedJacobiSubspaceMatrix_halo( + const struct gridContext * gc, const struct CSRMatrix M, const int l, + const uint_fast32_t nswp, const CTYPE omega, const MTYPE *invD, CTYPE *u, + const CTYPE *b, CTYPE *tmp) { + + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndof; + computePadding(gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndof); + + const int32_t nelxc = gc->nelx / ncell; + const int32_t nelyc = gc->nely / ncell; + const int32_t nelzc = gc->nelz / ncell; + + // usually nswp is between 1 and 5 + for (int s = 0; s < nswp; s++) { + applyStateOperatorSubspaceMatrix(*gc, l, M, u, tmp); + +// long for loop, as ndof is typically 300.000 or more, but also trivially +// parallel. + #pragma omp parallel for collapse(3) schedule(static) default(none) \ + firstprivate(nelxc,nelzc,nelyc,omega,wrapyc,wrapzc) shared(u,invD,b,tmp) + for (int i = 1; i < nelxc + 2; i++) + for (int k = 1; k < nelzc + 2; k++) + for (int j = 1; j < nelyc + 2; j++) { + const int nidx = (i * wrapyc * wrapzc + wrapyc * k + j); + + const uint32_t idx1 = 3 * nidx + 0; + const uint32_t idx2 = 3 * nidx + 1; + const uint32_t idx3 = 3 * nidx + 2; + + u[idx1] += omega * invD[idx1] * (b[idx1] - tmp[idx1]); + u[idx2] += omega * invD[idx2] * (b[idx2] - tmp[idx2]); + u[idx3] += omega * invD[idx3] * (b[idx3] - tmp[idx3]); + } + } +} + +// jacobi smoothing/preconditioning +// temperature: hot, called 2x(number of levels)x(number of cg iterations) ~ +// [20-1000] times every design iteration. 
Note that most compute time is spent +// in child function. + +void smoothDampedJacobi_halo(const struct gpuNode * gpu_node, const DTYPE *x, + const uint_fast32_t nswp, const CTYPE omega, + const MTYPE *invD, CTYPE *u, const CTYPE *b, + CTYPE *tmp) { + const struct gridContext gc = *(gpu_node->gc); + // usually nswp is between 1 and 5 + for (int s = 0; s < nswp; s++) { + applyStateOperator_stencil(gpu_node, x, u, tmp); + + // it is not faster to make an even simpler kernel with four loops +#pragma omp target teams distribute parallel for collapse(3) schedule(static) device(gpu_node->id) + for (int i = 1; i < gc.nelx + 2; i++) + for (int k = 1; k < gc.nelz + 2; k++) + for (int j = 1; j < gc.nely + 2; j++) { + const int nidx = i * gc.wrapy * gc.wrapz + gc.wrapy * k + j; + + const uint32_t idx1 = 3 * nidx + 0; + const uint32_t idx2 = 3 * nidx + 1; + const uint32_t idx3 = 3 * nidx + 2; + + u[idx1] += omega * invD[idx1] * (b[idx1] - tmp[idx1]); + u[idx2] += omega * invD[idx2] * (b[idx2] - tmp[idx2]); + u[idx3] += omega * invD[idx3] * (b[idx3] - tmp[idx3]); + } + } +} + + +// Vcycle preconditioner. recursive function. +// temperature: medium, called (number of levels)x(number of cg iterations ~ +// 5 - 100) every design iteration. Much of the compute time is spent in +// this function, although in children functions. +void VcyclePreconditioner(const struct gpuGrid * gpu_gc, const DTYPE *x, + const int nl, const int l, MTYPE **const invD, + struct CoarseSolverData *bottomSolverData, + const struct CSRMatrix *coarseMatrices, CTYPE omega, + const int nswp, CTYPE **r, CTYPE **z, CTYPE **d) { + const struct gridContext * gc = gpu_gc->target[0].gc; + + int ncell,wrapxc,wrapyc,wrapzc; + uint_fast32_t ndofc; + computePadding(gc,l,&ncell,&wrapxc,&wrapyc,&wrapzc,&ndofc); + + int ncell_nl,wrapx_nl,wrapy_nl,wrapz_nl; + uint_fast32_t ndof_nl; + computePadding(gc,l+1,&ncell_nl,&wrapx_nl,&wrapy_nl,&wrapz_nl,&ndof_nl); + + CTYPE *zptr = z[l]; + CTYPE *dptr = d[l]; + CTYPE *rptr = r[l]; + CTYPE * next_rptr = r[l + 1]; + CTYPE * next_zptr = z[l + 1]; + MTYPE *invDptr = invD[l]; + + #pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < ndofc; i++) + zptr[i] = 0.0; + // smooth + if (l == 0) { + //printf("Case 1 - enter\n"); + + smoothDampedJacobi_halo(&(gpu_gc->target[0]), x, nswp, omega, invDptr, zptr, rptr, dptr); + + applyStateOperator_stencil(&(gpu_gc->target[0]), x, zptr, dptr); + //#pragma omp target update from(dptr[:ndofc]) + + } else if (l < number_of_matrix_free_levels) { + //printf("Case 2 -enter \n"); + //#pragma omp target update to(invDptr[:ndofc],rptr[:ndofc],zptr[:ndofc]) + smoothDampedJacobiSubspace_halo(&(gpu_gc->target[0]), x, l, nswp, omega, invDptr, zptr, rptr, + dptr); + + applyStateOperatorSubspace_halo(&(gpu_gc->target[0]), l, x, zptr, dptr); + + //#pragma omp target update from(zptr[:ndofc]) + } else { + //printf("Case 3 -enter \n"); + #pragma omp target update from(zptr[:ndofc],rptr[:ndofc],dptr[:ndofc]) + smoothDampedJacobiSubspaceMatrix_halo(gc, coarseMatrices[l], l, nswp, omega, + invDptr, zptr, rptr, dptr); // Updates z[l] and d[l] + applyStateOperatorSubspaceMatrix(*gc, l, coarseMatrices[l], zptr, dptr); // Updates coarseMatrices[l] and d[l] + #pragma omp target update to(zptr[:ndofc],dptr[:ndofc]) + } + + + // long for loop, as ndof is typically 300.000 or more, but also trivially + // parallel + #pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < ndofc; i++) + dptr[i] = rptr[i] - dptr[i]; + + //#pragma omp target 
update from(dptr[:ndofc]) + + + // project residual down + projectToCoarserGrid_halo(&(gpu_gc->target[0]), l, dptr, next_rptr); + //printf("To coarser grid\n"); + + // smooth coarse + if (nl == l + 2) { + #pragma omp parallel for schedule(static) default(none) firstprivate(ndof_nl,l) shared(next_zptr) + for (int i = 0; i < ndof_nl; i++) + next_zptr[i] = 0.0; + + // Must run on CPU due to Cholesky factorization + #pragma omp target update from(next_rptr[:ndof_nl]) + solveSubspaceMatrix(*gc, l + 1, *bottomSolverData, next_rptr, next_zptr); + #pragma omp target update to(next_zptr[:ndof_nl]) + //printf("Solved on CPU\n"); + + } else + VcyclePreconditioner(gpu_gc, x, nl, l + 1, invD, bottomSolverData, + coarseMatrices, omega, nswp, r, z, d); + + // project residual up + projectToFinerGrid_halo(&(gpu_gc->target[0]), l, next_zptr, dptr); + //printf("To finer grid\n"); + + // smooth + if (l == 0) { + //printf("Case 1 exit\n"); + //#pragma omp target update to(dptr[:ndofc]) + + #pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < ndofc; i++) + zptr[i] += dptr[i]; + + smoothDampedJacobi_halo(&(gpu_gc->target[0]), x, nswp, omega, invDptr, zptr, rptr, dptr); + + //#pragma omp target update from(zptr[:ndofc]) + } + else if (l < number_of_matrix_free_levels) { + //printf("Case 2 exit\n"); + + #pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < ndofc; i++) + zptr[i] += dptr[i]; + + //#pragma omp target update to(invDptr[:ndofc]) + smoothDampedJacobiSubspace_halo(&(gpu_gc->target[0]), x, l, nswp, omega, invDptr, zptr, rptr, + dptr); + //#pragma omp target update from(zptr[:ndofc]) + } else { + //printf("Case 3 - exit\n"); + + #pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < ndofc; i++) + zptr[i] += dptr[i]; + + #pragma omp target update from(zptr[:ndofc],rptr[:ndofc],dptr[:ndofc]) + + smoothDampedJacobiSubspaceMatrix_halo(gc, coarseMatrices[l], l, nswp, omega, + invD[l], zptr, rptr, dptr); + #pragma omp target update to(dptr[:ndofc],zptr[:ndofc]) + } +} + +// solves the linear system of Ku = b. +// temperature: medium, accounts for 95% or more of runtime, but this time is +// spent in children functions. 
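+//
+// For reference, the solve is a conjugate gradient iteration preconditioned
+// by the V-cycle above; per level l one V-cycle roughly performs
+//   z_l = 0;  pre-smooth  K_l z_l = r_l  with damped Jacobi (nswp sweeps)
+//   d_l = r_l - K_l z_l;  r_{l+1} = restrict(d_l)
+//   recurse on level l+1, or solve the coarsest system directly on the CPU
+//   z_l += prolong(z_{l+1});  post-smooth  K_l z_l = r_l  with damped Jacobi
+//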
+// The iter loop of this function is a good candidate for a GPU parallel
+// region scope, as it is only entered once per design iteration (and thus
+// only about 100 times during a program run).
+void solveStateMG_halo(const struct gpuGrid * gpu_gc, DTYPE *x, const int nswp,
+                       const int nl, const CTYPE tol,
+                       struct SolverDataBuffer *data, int *finalIter,
+                       float *finalRes, CTYPE *b, STYPE *u) {
+
+  const struct gridContext * gc = gpu_gc->target[0].gc;
+
+  const uint_fast32_t ndof = 3 * gc->wrapx * gc->wrapy * gc->wrapz;
+
+  CTYPE *r = data->r;
+  CTYPE *p = data->p;
+  CTYPE *q = data->q;
+  CTYPE *z = data->z;
+
+  MTYPE **invD = data->invD;
+  CTYPE **dmg = data->dmg;
+  CTYPE **rmg = data->rmg;
+  CTYPE **zmg = data->zmg;
+
+  // setup residual vector
+  #pragma omp target teams distribute parallel for schedule(static) nowait
+  for (uint_fast32_t i = 0; i < ndof; i++)
+    z[i] = (CTYPE)u[i];
+
+  for (int l = number_of_matrix_free_levels; l < nl; l++) {
+    // printf("assemble mat l:%i\n", l);
+    assembleSubspaceMatrix(*gc, l, x, data->coarseMatrices[l], invD[l]);
+    MTYPE *invDptr = invD[l];
+    int ncell, wrapxc, wrapyc, wrapzc;
+    uint_fast32_t ndofc;
+    computePadding(gc, l, &ncell, &wrapxc, &wrapyc, &wrapzc, &ndofc);
+    #pragma omp target update to(invDptr[:ndofc])
+  }
+
+  for (int l = 0; l < nl; l++) {
+    assembleInvertedMatrixDiagonalSubspace_halo(&(gpu_gc->target[0]), x, l, invD[l]);
+    int ncell, wrapxc, wrapyc, wrapzc;
+    uint_fast32_t ndofc;
+    computePadding(gc, l, &ncell, &wrapxc, &wrapyc, &wrapzc, &ndofc);
+    #pragma omp target update from(invD[l][:ndofc]) nowait
+  }
+
+  factorizeSubspaceMatrix(*gc, nl - 1, data->bottomSolver,
+                          data->coarseMatrices[nl - 1]);
+
+  //const uint32_t designSize = (gc.wrapx - 1) * (gc.wrapy - 1) * (gc.wrapz - 1);
+  CTYPE rhoold = 0.0;
+  CTYPE dpr;
+  CTYPE alpha;
+  CTYPE rho;
+  //CTYPE *dptr = dmg[0];
+  //MTYPE *invDptr = invD[0];
+
+//#pragma omp target data map(to:b \
+  [:ndof])//, z \
+  [:ndof], r \
+  [:ndof], p \
+  [:ndof], q \
+  [:ndof], dptr \
+  [:ndof], invDptr \
+  [:ndof]) map(tofrom \
+  : u[:ndof],x[:designSize])
+  {
+
+    #pragma omp taskwait
+
+    applyStateOperator_stencil(&(gpu_gc->target[0]), x, z, r);
+
+    #pragma omp target teams distribute parallel for schedule(static)
+    for (uint_fast32_t i = 0; i < ndof; i++)
+      r[i] = b[i] - r[i];
+
+    // setup scalars
+    const MTYPE omega = 0.6;
+    const CTYPE bnorm = norm(b, ndof);
+    const int maxIter = 1000;
+
+    // begin cg loop - usually spans 5 - 300 iterations; this will be reduced
+    // to 5 - 20 iterations once a direct solver is included for the coarse
+    // subproblem.
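+    // One iteration of the preconditioned CG recurrence below, where M is the
+    // multigrid V-cycle preconditioner and K the stencil-based state operator:
+    //   z     = M r                      (VcyclePreconditioner)
+    //   rho   = (r, z)
+    //   p     = z + (rho / rho_old) p    (p = z in the first iteration)
+    //   q     = K p                      (applyStateOperator_stencil)
+    //   alpha = rho / (p, q)
+    //   u    += alpha p;   r -= alpha q
+    // The loop exits once ||r|| / ||b|| < tol or maxIter is reached.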
+    for (int iter = 0; iter < maxIter; iter++) {
+
+      //#pragma omp target update from(r[:ndof])
+
+      // get preconditioned vector
+      VcyclePreconditioner(gpu_gc, x, nl, 0, invD, &data->bottomSolver,
+                           data->coarseMatrices, omega, nswp, rmg, zmg, dmg);
+      //printf("Exit VcyclePreconditioner\n");
+      //#pragma omp target update to(z[:ndof])
+
+      rho = innerProduct(r, z, ndof);
+
+      if (iter == 0) {
+
+        #pragma omp target teams distribute parallel for schedule(static)
+        for (uint_fast32_t i = 0; i < ndof; i++)
+          p[i] = z[i];
+
+      } else {
+
+        CTYPE beta = rho / rhoold;
+        #pragma omp target teams distribute parallel for firstprivate(beta) schedule(static)
+        for (uint_fast32_t i = 0; i < ndof; i++)
+          p[i] = beta * p[i] + z[i];
+      }
+
+      applyStateOperator_stencil(&(gpu_gc->target[0]), x, p, q);
+
+      dpr = innerProduct(p, q, ndof);
+
+      alpha = rho / dpr;
+      rhoold = rho;
+
+      #pragma omp target teams distribute parallel for firstprivate(alpha) schedule(static) nowait
+      for (uint_fast32_t i = 0; i < ndof; i++)
+        u[i] += (STYPE)(alpha * p[i]);
+
+      #pragma omp target teams distribute parallel for firstprivate(alpha) schedule(static)
+      for (uint_fast32_t i = 0; i < ndof; i++)
+        r[i] -= alpha * q[i];
+
+      CTYPE rnorm = 0.0;
+
+      rnorm = norm(r, ndof);
+
+      const CTYPE relres = rnorm / bnorm;
+
+      (*finalIter) = iter;
+      (*finalRes) = relres;
+
+      #pragma omp taskwait
+
+      //printf("it: %i, res=%e\n", iter, relres);
+
+      if (relres < tol)
+        break;
+    }
+  }
+}
+
+void allocateSolverData(const struct gridContext gc, const int nl,
+                        struct SolverDataBuffer *data) {
+
+  allocateZeroPaddedStateField(gc, 0, &(*data).r);
+  allocateZeroPaddedStateField(gc, 0, &(*data).p);
+  allocateZeroPaddedStateField(gc, 0, &(*data).q);
+  allocateZeroPaddedStateField(gc, 0, &(*data).z);
+
+  (*data).invD = malloc(sizeof(MTYPE *) * nl);
+  (*data).dmg = malloc(sizeof(CTYPE *) * nl);
+  (*data).rmg = malloc(sizeof(CTYPE *) * nl);
+  (*data).zmg = malloc(sizeof(CTYPE *) * nl);
+
+  allocateZeroPaddedStateField(gc, 0, &((*data).dmg[0]));
+  allocateZeroPaddedStateField_MTYPE(gc, 0, &((*data).invD[0]));
+  (*data).rmg[0] = (*data).r;
+  (*data).zmg[0] = (*data).z;
+
+  for (int l = 1; l < nl; l++) {
+    allocateZeroPaddedStateField(gc, l, &((*data).dmg[l]));
+    allocateZeroPaddedStateField(gc, l, &((*data).rmg[l]));
+    allocateZeroPaddedStateField(gc, l, &((*data).zmg[l]));
+    allocateZeroPaddedStateField_MTYPE(gc, l, &((*data).invD[l]));
+  }
+
+  // allocate for all levels for easy indexing
+  (*data).coarseMatrices = malloc(sizeof(struct CSRMatrix) * nl);
+  for (int l = number_of_matrix_free_levels; l < nl; l++) {
+    allocateSubspaceMatrix(gc, l, &((*data).coarseMatrices[l]));
+  }
+}
+
+void freeSolverData(struct SolverDataBuffer *data, const int nl) {
+
+  free((*data).r);
+  free((*data).z);
+  free((*data).p);
+  free((*data).q);
+
+  free((*data).invD[0]);
+  free((*data).dmg[0]);
+
+  for (int l = 1; l < nl; l++) {
+    free((*data).invD[l]);
+    free((*data).dmg[l]);
+    free((*data).rmg[l]);
+    free((*data).zmg[l]);
+  }
+
+  free((*data).invD);
+  free((*data).dmg);
+  free((*data).zmg);
+  free((*data).rmg);
+
+  for (int l = number_of_matrix_free_levels; l < nl; l++) {
+    freeSubspaceMatrix(&((*data).coarseMatrices[l]));
+  }
+  free((*data).coarseMatrices);
+}
diff --git a/stencil_solvers.h b/stencil_solvers.h
new file mode 100644
index 0000000..eaf0e85
--- /dev/null
+++ b/stencil_solvers.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include "definitions.h"
+#include "gpu_definitions.h"
+
+#include "stencil_assembly.h"
+
+void smoothDampedJacobi_halo(const struct gpuNode * gpu_node, const 
DTYPE *x,
+                             const uint_fast32_t nswp, const CTYPE omega,
+                             const MTYPE *invD, CTYPE *u, const CTYPE *b,
+                             CTYPE *tmp);
+
+void smoothDampedJacobiSubspaceMatrix_halo(
+    const struct gridContext * gc, const struct CSRMatrix M, const int l,
+    const uint_fast32_t nswp, const CTYPE omega, const MTYPE *invD, CTYPE *u,
+    const CTYPE *b, CTYPE *tmp);
+
+void smoothDampedJacobiSubspace_halo(const struct gpuNode * gpu_node,
+                                     const DTYPE *x, const int l,
+                                     const uint_fast32_t nswp,
+                                     const CTYPE omega, const MTYPE *invD,
+                                     CTYPE *u, const CTYPE *b, CTYPE *tmp);
+
+void solveStateMG_halo(const struct gpuGrid * gpu_gc, DTYPE *x, const int nswp,
+                       const int nl, const CTYPE tol,
+                       struct SolverDataBuffer *data, int *finalIter,
+                       float *finalRes, CTYPE *b, STYPE *u);
+
+void allocateSolverData(const struct gridContext gc, const int nl,
+                        struct SolverDataBuffer *data);
+
+void freeSolverData(struct SolverDataBuffer *data, const int nl);
+
+// compute the Euclidean norm of a vector
+// temperature: cold-medium, called 2 x number of cg iterations
+__force_inline inline CTYPE norm(CTYPE *v, uint_fast32_t size) {
+  CTYPE val = 0.0;
+  // long for loop, as ndof is typically 300,000 or more, but also trivially
+  // parallel.
+#pragma omp target teams distribute parallel for reduction(+ : val) map(always,tofrom: val)
+  for (uint_fast32_t i = 0; i < size; i++)
+    val += v[i] * v[i];
+  val = sqrt(val);
+  return val;
+}
+
+// compute the inner product of two vectors
+// temperature: cold-medium, called 2 x number of cg iterations
+__force_inline inline CTYPE innerProduct(CTYPE *a, CTYPE *b,
+                                         uint_fast32_t size) {
+  CTYPE val = 0.0;
+  // long for loop, as ndof is typically 300,000 or more, but also trivially
+  // parallel.
+#pragma omp target teams distribute parallel for reduction(+ : val) map(always,tofrom: val) //firstprivate(size)
+  for (uint_fast32_t i = 0; i < size; i++)
+    val += a[i] * b[i];
+  return val;
+}
diff --git a/stencil_utility.h b/stencil_utility.h
new file mode 100644
index 0000000..290dbbb
--- /dev/null
+++ b/stencil_utility.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "definitions.h"
+
+// compute indices of displacement for a given element number
+// temperature: very very hot, called as part of the hot kernels in the
+// program, should be inlined always.
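+//
+// Example: a node at (i, j, k) in the halo-padded grid has the linear index
+//   n = i * wrapy * wrapz + k * wrapy + j
+// and owns three displacement degrees of freedom stored contiguously as
+//   3 * n + 0, 3 * n + 1, 3 * n + 2.
+// getEdof_halo gathers these three indices for the eight corner nodes of
+// element (i, j, k), giving the 24 element degrees of freedom.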
+__force_inline inline void getEdof_halo(uint_fast32_t edof[24], const int i, + const int j, const int k, + const int wrapy, const int wrapz) { + + const int nx_1 = i; + const int nx_2 = i + 1; + const int nz_1 = k; + const int nz_2 = k + 1; + const int ny_1 = j; + const int ny_2 = j + 1; + + const uint_fast32_t nIndex1 = nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_2; + const uint_fast32_t nIndex2 = nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_2; + const uint_fast32_t nIndex3 = nx_2 * wrapy * wrapz + nz_1 * wrapy + ny_1; + const uint_fast32_t nIndex4 = nx_1 * wrapy * wrapz + nz_1 * wrapy + ny_1; + const uint_fast32_t nIndex5 = nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_2; + const uint_fast32_t nIndex6 = nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_2; + const uint_fast32_t nIndex7 = nx_2 * wrapy * wrapz + nz_2 * wrapy + ny_1; + const uint_fast32_t nIndex8 = nx_1 * wrapy * wrapz + nz_2 * wrapy + ny_1; + + edof[0] = 3 * nIndex1 + 0; + edof[1] = 3 * nIndex1 + 1; + edof[2] = 3 * nIndex1 + 2; + edof[3] = 3 * nIndex2 + 0; + edof[4] = 3 * nIndex2 + 1; + edof[5] = 3 * nIndex2 + 2; + edof[6] = 3 * nIndex3 + 0; + edof[7] = 3 * nIndex3 + 1; + edof[8] = 3 * nIndex3 + 2; + edof[9] = 3 * nIndex4 + 0; + edof[10] = 3 * nIndex4 + 1; + edof[11] = 3 * nIndex4 + 2; + + edof[12] = 3 * nIndex5 + 0; + edof[13] = 3 * nIndex5 + 1; + edof[14] = 3 * nIndex5 + 2; + edof[15] = 3 * nIndex6 + 0; + edof[16] = 3 * nIndex6 + 1; + edof[17] = 3 * nIndex6 + 2; + edof[18] = 3 * nIndex7 + 0; + edof[19] = 3 * nIndex7 + 1; + edof[20] = 3 * nIndex7 + 2; + edof[21] = 3 * nIndex8 + 0; + edof[22] = 3 * nIndex8 + 1; + edof[23] = 3 * nIndex8 + 2; +} + +// convert the node index from coordinates with halo padding to a grid without. +// requires the wrapping parameters from the grid with halo and size of the grid +// without +__force_inline inline int haloToTrue(const int index, const int wrapy, + const int wrapz, const int ny, + const int nz) { + + const int nodeNumber = index / 3; + const int dofOffset = index % 3; + + const int i_halo = nodeNumber / (wrapy * wrapz); + const int j_halo = nodeNumber % wrapy; + const int k_halo = (nodeNumber % (wrapy * wrapz)) / wrapy; + + const int i = i_halo - 1; + const int j = j_halo - 1; + const int k = k_halo - 1; + + const int newNodeNumber = i * ny * nz + k * ny + j; + + return 3 * newNodeNumber + dofOffset; +} diff --git a/top3d.c b/top3d.c new file mode 100644 index 0000000..8492a0e --- /dev/null +++ b/top3d.c @@ -0,0 +1,88 @@ +#include "getopt.h" + +#include "definitions.h" + +#include "local_matrix.h" +#include "stencil_grid_utility.h" +#include "stencil_methods.h" + +#include "stencil_assembly.h" +#include "stencil_optimization.h" +#include "stencil_solvers.h" + +// todo: accept parameters as command line arguments +int main(int argc, char *argv[]) { + + int nelx_coarse = 12; + int nely_coarse = 6; + int nelz_coarse = 6; + float volfrac = 0.2; + float rmin = 1.5; + int iters = 20; + int nl = 4; + int verbose = 0; + int write_result = 0; + int opt; + + while ((opt = getopt(argc, argv, "x:y:z:r:f:i:l:v:w:")) != -1) { + switch (opt) { + case 'x': + nelx_coarse = atoi(optarg); + break; + case 'y': + nely_coarse = atoi(optarg); + break; + case 'z': + nelz_coarse = atoi(optarg); + break; + case 'r': + rmin = atof(optarg); + break; + case 'f': + volfrac = atof(optarg); + break; + case 'i': + iters = atoi(optarg); + break; + case 'l': + nl = atoi(optarg); + break; + case 'v': + verbose = atoi(optarg); + break; + case 'w': + write_result = atoi(optarg); + break; + default: + fprintf(stderr, "Usage: %s 
[-xyzrfilvw]\n", argv[0]); + exit(EXIT_FAILURE); + } + } + + int sizeIncr = 2; + for (int i = 2; i < nl; i++) + sizeIncr *= 2; + + const int nelx = nelx_coarse * sizeIncr; + const int nely = nely_coarse * sizeIncr; + const int nelz = nelz_coarse * sizeIncr; + + if (verbose) { + printf( + "Running topopt with:\n number of coarse els x (-x): %i (fine els = " + "%i)\n number of coarse els y (-y): %i (fine els = %i)\n number of " + "coarse els z (-z): %i (fine els = %i)\n total number of elements: " + "%i\n volume fraction (-f): %f\n filter radius in elements (-r): %f\n " + "number of design iterations (-i): %i\n verbose (-v): %i\n write file " + "(-w): %i\n\n", + nelx_coarse, nelx, nely_coarse, nely, nelz_coarse, nelz, + nelx * nely * nelz, volfrac, rmin, iters, verbose, write_result); + } + + const float cgtol = 1e-5; + const int cgmax = 200; + + top3dmgcg(nelx, nely, nelz, volfrac, rmin, nl, cgtol, cgmax, verbose, + write_result, iters); + return 0; +}
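+
+// Example: with the default arguments (-x 12, -y 6, -z 6, -l 4) the loop above
+// gives sizeIncr = 2^(nl - 1) = 8, so the fine grid passed to top3dmgcg is
+// 96 x 48 x 48 elements.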