diff --git a/.gitmodules b/.gitmodules
index 2fa156d1..5857c773 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,7 +2,3 @@
 	path = ext/hwmalloc
     url = https://github.com/ghex-org/hwmalloc.git
     branch = master
-[submodule "ext/googletest"]
-	path = ext/googletest
-	url = https://github.com/google/googletest.git
-    branch = main
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c60d1b1..e0ae8b3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 option(OOMPH_GIT_SUBMODULE "Check submodules during build" ON)
 option(OOMPH_USE_BUNDLED_LIBS "Use bundled 3rd party libraries" OFF)
 include(oomph_external_dependencies)
+include(ExternalProject)
 
 # ---------------------------------------------------------------------
 # Define main oomph library
@@ -94,6 +95,8 @@ add_subdirectory(bindings)
 # ---------------------------------------------------------------------
 set(OOMPH_WITH_TESTING OFF CACHE BOOL "True if tests shall be built")
 if (OOMPH_WITH_TESTING)
+    set(CTEST_TEST_TIMEOUT 30)
+    include(CTest)
     enable_testing()
     add_subdirectory(test)
 endif()
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 9652b48b..8e018cef 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -22,6 +22,7 @@ function(make_benchmark t_ lib)
     add_executable(${t} ${t_}_mt.cpp)
     oomph_target_compile_options(${t})
     target_link_libraries(${t} PRIVATE oomph_${lib})
+    target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_${lib})
 endfunction()
 
 function(make_benchmark_mt t_ lib)
@@ -30,6 +31,7 @@ function(make_benchmark_mt t_ lib)
     add_executable(${t} ${t_}_mt.cpp)
     oomph_target_compile_options(${t})
     target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_MT)
+    target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_${lib})
     target_link_libraries(${t} PRIVATE oomph_${lib})
     target_link_libraries(${t} PRIVATE OpenMP::OpenMP_CXX)
 endfunction()
@@ -104,6 +106,7 @@ if (OOMPH_WITH_LIBFABRIC)
             make_benchmark_mt(${t} libfabric)
         endif()
     endforeach()
+    add_subdirectory(scripts)
 endif()
 else()
     message("warning: benchmarks cannot be built unless barrier is enabled")
diff --git a/benchmarks/mpi_environment.hpp b/benchmarks/mpi_environment.hpp
index 7affabd3..50d2c2bb 100644
--- a/benchmarks/mpi_environment.hpp
+++ b/benchmarks/mpi_environment.hpp
@@ -12,6 +12,25 @@
 #include <mpi.h>
 #include <iostream>
 
+#ifdef OOMPH_BENCHMARKS_mpi
+#define TRANSPORT_STRING "mpi"
+#define PROGRESS_STRING  "unspecified"
+#define ENDPOINT_STRING  "unspecified"
+#endif
+
+#ifdef OOMPH_BENCHMARKS_ucx
+#define TRANSPORT_STRING "ucx"
+#define PROGRESS_STRING  "unspecified"
+#define ENDPOINT_STRING  "unspecified"
+#endif
+
+#ifdef OOMPH_BENCHMARKS_libfabric
+#include "../src/libfabric/controller.hpp"
+#define TRANSPORT_STRING "libfabric"
+#define PROGRESS_STRING  LIBFABRIC_PROGRESS_STRING
+#define ENDPOINT_STRING  LIBFABRIC_ENDPOINT_STRING
+#endif
+
 namespace oomph
 {
 struct mpi_environment
diff --git a/benchmarks/mpi_p2p_bi_avail_mt.cpp b/benchmarks/mpi_p2p_bi_avail_mt.cpp
index 4bc1dedd..6013eb2c 100644
--- a/benchmarks/mpi_p2p_bi_avail_mt.cpp
+++ b/benchmarks/mpi_p2p_bi_avail_mt.cpp
@@ -23,6 +23,13 @@
 #include <omp.h>
 #endif /* OOMPH_BENCHMARKS_MT */
 
+#ifdef USE_TESTANY
+const char* syncmode = "testany";
+#else
+const char* syncmode = "test";
+#endif
+const char* waitmode = "avail";
+
 int
 main(int argc, char* argv[])
 {
@@ -209,8 +216,26 @@ main(int argc, char* argv[])
     MPI_Barrier(MPI_COMM_WORLD);
     if (rank == 1)
     {
+        double elapsed = t1.toc();
         t1.vtoc();
         t1.vtoc("final ", (double)niter * size * buff_size);
+        double bw = ((double)(niter * size) * buff_size) / elapsed;
+        // clang-format off
+        std::cout << "time:       " << elapsed/1000000 << "s\n";
+        std::cout << "final MB/s: " << bw << "\n";
+        std::cout << "CSVData"
+                  << ", niter, " << niter
+                  << ", buff_size, " << buff_size
+                  << ", inflight, " << inflight
+                  << ", num_threads, " << cmd_args.num_threads
+                  << ", syncmode, " << syncmode
+                  << ", waitmode, " << waitmode
+                  << ", transport, " << "Native-MPI"
+                  << ", BW MB/s, " << bw
+                  << ", progress, " << "unspecified"
+                  << ", endpoint, " << "unspecified"
+                  << "\n";
+        // clang-format on
     }
 
     return 0;
diff --git a/benchmarks/mpi_p2p_bi_wait_mt.cpp b/benchmarks/mpi_p2p_bi_wait_mt.cpp
index c19d69e6..617862de 100644
--- a/benchmarks/mpi_p2p_bi_wait_mt.cpp
+++ b/benchmarks/mpi_p2p_bi_wait_mt.cpp
@@ -23,6 +23,13 @@
 #include <omp.h>
 #endif /* OOMPH_BENCHMARKS_MT */
 
+#ifdef USE_WAITALL
+const char* syncmode = "waitall";
+#else
+const char* syncmode = "wait";
+#endif
+const char* waitmode = "wait";
+
 int
 main(int argc, char* argv[])
 {
@@ -155,8 +162,26 @@ main(int argc, char* argv[])
     MPI_Barrier(MPI_COMM_WORLD);
     if (rank == 1)
     {
+        double elapsed = t1.toc();
         t1.vtoc();
         t1.vtoc("final ", (double)niter * size * buff_size);
+        double bw = ((double)(niter * size) * buff_size) / elapsed;
+        // clang-format off
+        std::cout << "time:       " << elapsed/1000000 << "s\n";
+        std::cout << "final MB/s: " << bw << "\n";
+        std::cout << "CSVData"
+                  << ", niter, " << niter
+                  << ", buff_size, " << buff_size
+                  << ", inflight, " << inflight
+                  << ", num_threads, " << cmd_args.num_threads
+                  << ", syncmode, " << syncmode
+                  << ", waitmode, " << waitmode
+                  << ", transport, " << "Native-MPI"
+                  << ", BW MB/s, " << bw
+                  << ", progress, " << "unspecified"
+                  << ", endpoint, " << "unspecified"
+                  << "\n";
+        // clang-format on
     }
 
     return 0;
diff --git a/benchmarks/scripts/CMakeLists.txt b/benchmarks/scripts/CMakeLists.txt
new file mode 100644
index 00000000..e74f3c70
--- /dev/null
+++ b/benchmarks/scripts/CMakeLists.txt
@@ -0,0 +1,68 @@
+#-------------------------------------------------------
+# Slurm job launching script generator
+#
+# We would like to generate a script which can be used to generate batch jobs
+# to submit benchmarks to slurm using many combinations of settings.
+#
+# We add a custom command which takes our template script and
+# expands out all the variables we need to pass into it.
+# Unfortunately, due to the way cmake works, some variables are only known
+# at build time, and not at cmake configure time. Using a custom command which
+# calls cmake to run our script at build time, allows us to pass variables
+# into the final script which is placed in our build dir.
+#
+# Note that we generate these scripts in the build dir instead
+# of the install dir as they are intended for development testing.
+# A version could be supported for installation later.
+#-------------------------------------------------------
+
+set(OOMPH_BENCHMARK_SCRIPTS_PATH "${PROJECT_BINARY_DIR}/scripts" CACHE PATH "Directory to place batch scripts in")
+mark_as_advanced(OOMPH_BENCHMARK_SCRIPTS_PATH)
+
+# Make sure scripts dir exists
+execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${OOMPH_BENCHMARK_SCRIPTS_PATH}")
+
+# The MPI backend is always enabled and uses no extension on the benchmarks
+# it will be added automatically by the script runner
+set(BENCHMARK_SUFFIXES)
+
+if (OOMPH_WITH_LIBFABRIC)
+    list(APPEND BENCHMARK_SUFFIXES _libfabric)
+endif()
+if (OOMPH_WITH_UCX)
+    list(APPEND BENCHMARK_SUFFIXES _ucx)
+endif()
+
+#--------------------------------------------------
+# Slurm script generator for benchmarks
+#--------------------------------------------------
+set(SCRIPTS "generate-oomph.py")
+
+foreach(script ${SCRIPTS})
+  ADD_CUSTOM_COMMAND(
+    DEPENDS   "${CMAKE_CURRENT_SOURCE_DIR}/${script}"
+    COMMAND   "${CMAKE_COMMAND}"
+    ARGS
+              # needed by the benchmark script
+              -DBIN_DIR=${PROJECT_BINARY_DIR}/benchmarks
+              -DRUN_DIR=${OOMPH_BENCHMARK_SCRIPTS_PATH}
+              -DMPIEXEC="${MPIEXEC}"
+
+              # needed by the copy script
+              -DSCRIPT_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}"
+              -DSCRIPT_NAME=${script}
+              -DSCRIPT_DEST_DIR="${OOMPH_BENCHMARK_SCRIPTS_PATH}"
+
+              # quoted as it might be a list
+              "-DBENCHMARK_SUFFIXES=${BENCHMARK_SUFFIXES}"
+              -DJOB_OPTIONS1="${SLURM_JOB_OPTIONS1}"
+              -P "${CMAKE_CURRENT_SOURCE_DIR}/copy_script.cmake"
+
+    OUTPUT    "${OOMPH_BENCHMARK_SCRIPTS_PATH}/${script}"
+    VERBATIM
+  )
+
+  add_custom_target(script-${script}
+    DEPENDS "${OOMPH_BENCHMARK_SCRIPTS_PATH}/${script}"
+  )
+endforeach(script)
diff --git a/benchmarks/scripts/copy_script.cmake b/benchmarks/scripts/copy_script.cmake
new file mode 100644
index 00000000..837676de
--- /dev/null
+++ b/benchmarks/scripts/copy_script.cmake
@@ -0,0 +1,22 @@
+FILE(TO_NATIVE_PATH "${SCRIPT_SOURCE_DIR}/${SCRIPT_NAME}" INFILE)
+FILE(TO_NATIVE_PATH "${SCRIPT_DEST_DIR}/${SCRIPT_NAME}" OUTFILE)
+
+STRING(REPLACE "\"" "" FILE1 "${INFILE}")
+STRING(REPLACE "\"" "" FILE2 "${OUTFILE}")
+
+#
+# when debugging, this dumps all vars out
+#
+if (DEBUG_THIS_SCRIPT)
+    get_cmake_property(_variableNames VARIABLES)
+    list (SORT _variableNames)
+    foreach (_variableName ${_variableNames})
+        message(STATUS "${_variableName}=${${_variableName}}")
+    endforeach()
+endif()
+
+configure_file(
+  "${FILE1}"
+  "${FILE2}"
+  @ONLY
+)
diff --git a/benchmarks/scripts/generate-oomph.ipynb b/benchmarks/scripts/generate-oomph.ipynb
new file mode 100644
index 00000000..eb077b36
--- /dev/null
+++ b/benchmarks/scripts/generate-oomph.ipynb
@@ -0,0 +1,557 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import product\n",
+    "import math\n",
+    "import numpy as np\n",
+    "import inspect\n",
+    "import os\n",
+    "import time\n",
+    "from IPython.display import Image, display, HTML\n",
+    "import importlib\n",
+    "import socket\n",
+    "import argparse\n",
+    "\n",
+    "# working dir\n",
+    "cwd = os.getcwd()\n",
+    "\n",
+    "# name of this script\n",
+    "scriptname = inspect.getframeinfo(inspect.currentframe()).filename\n",
+    "scriptpath = os.path.dirname(os.path.abspath(scriptname))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>.container { width:100% !important; }</style>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[NbConvertApp] Converting notebook generate-oomph.ipynb to script\n",
+      "[NbConvertApp] Writing 13176 bytes to generate-oomph.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "def is_notebook():\n",
+    "    try:\n",
+    "        shell = get_ipython().__class__.__name__\n",
+    "        if shell == 'ZMQInteractiveShell':\n",
+    "            return True   # Jupyter notebook or qtconsole\n",
+    "        elif shell == 'TerminalInteractiveShell':\n",
+    "            return False  # Terminal running IPython\n",
+    "        else:\n",
+    "            return False  # Other type (?)\n",
+    "    except NameError:\n",
+    "        return False      # Probably standard Python interpreter\n",
+    "\n",
+    "if is_notebook():\n",
+    "    # this makes the notebook wider on a larger screen using %x of the display\n",
+    "    display(HTML(\"<style>.container { width:100% !important; }</style>\"))\n",
+    "    # save this notebook as a raw python file as well please\n",
+    "    get_ipython().system('jupyter nbconvert --to script generate-oomph.ipynb')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ------------------------------------------------------------------\n",
+    "# Command line params\n",
+    "# ------------------------------------------------------------------\n",
+    "def get_command_line_args(notebook_args=None):\n",
+    "    parser = argparse.ArgumentParser(description='Generator for oomph benchmarks')\n",
+    "    parser.add_argument('-d', '--dir', default=cwd, action='store', help='base directory to generate job scripts in')\n",
+    "    parser.add_argument('-t', '--type', default='normal', action='store', help='normal, timed or native for different test types')\n",
+    "    parser.add_argument('-T', '--timeout', default=120, action='store', help='executable timeout period')\n",
+    "    parser.add_argument('-m', '--machine', default='', action='store', help='select machine batch job config/preamble')\n",
+    "    if is_notebook():\n",
+    "        parser.add_argument('-f', help='seems to be defaulted by jupyter')\n",
+    "        return parser.parse_args(notebook_args)\n",
+    "    return parser.parse_args()\n",
+    "\n",
+    "notebook_args = '--type=native --dir /home/biddisco/benchmarking-results/test'.split()\n",
+    "if is_notebook():\n",
+    "    args = get_command_line_args(notebook_args)\n",
+    "else:\n",
+    "    args = get_command_line_args()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CWD        : /home/biddisco/src/ghex/extern/oomph/benchmarks/scripts \n",
+      "Scriptpath : /tmp/ipykernel_91426 \n",
+      "Hostname   : oryx2\n"
+     ]
+    }
+   ],
+   "source": [
+    "# hostname + cleanup login node 'daint101' etc\n",
+    "if args.machine != '':\n",
+    "    hostname = args.machine\n",
+    "elif os.environ.get('LUMI_STACK_NAME', 'oryx2') == 'LUMI':\n",
+    "    hostname = 'lumi'\n",
+    "elif socket.gethostname().startswith('daint'):\n",
+    "    hostname = 'daint'\n",
+    "else :\n",
+    "    hostname = 'oryx2'\n",
+    "\n",
+    "# summary\n",
+    "print(f'CWD        : {cwd} \\nScriptpath : {scriptpath} \\nHostname   : {hostname}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_executable(path):\n",
+    "    mode = os.stat(path).st_mode\n",
+    "    mode |= (mode & 0o444) >> 2    # copy R bits to X\n",
+    "    os.chmod(path, mode)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generating scripts in /home/biddisco/benchmarking-results/test\n"
+     ]
+    }
+   ],
+   "source": [
+    "# strings with @xxx@ will be substituted by cmake\n",
+    "binary_dir = \"@BIN_DIR@\"\n",
+    "\n",
+    "if args.dir:\n",
+    "    run_dir = args.dir\n",
+    "else:\n",
+    "    run_dir = \"@RUN_DIR@\"\n",
+    "\n",
+    "print(f'Generating scripts in {run_dir}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cscs = {}\n",
+    "\n",
+    "# jb laptop\n",
+    "cscs[\"oryx2\"] = {\n",
+    "  \"Machine\":\"system76\",\n",
+    "  \"Cores\": 8,\n",
+    "  \"Threads per core\": 2,\n",
+    "  \"Allowed rpns\": [1, 2],\n",
+    "  \"Thread_array\": [1,2,4],\n",
+    "  \"Sleeptime\":0,\n",
+    "  \"Launch\": \"pushd {job_path} && source {job_file} && popd\",\n",
+    "  \"Run command\": \"mpiexec -n {total_ranks} --oversubscribe timeout {timeout} \",\n",
+    "  \"Batch preamble\": \"\"\"\n",
+    "#!/bin/bash -l\n",
+    "\n",
+    "# Env\n",
+    "#export OMP_NUM_THREADS={threads}\n",
+    "#export GOMP_CPU_AFFINITY=0-{threadsm1}\n",
+    "\n",
+    "# Commands\n",
+    "\"\"\"\n",
+    "}\n",
+    "\n",
+    "# daint mc nodes config\n",
+    "cscs[\"daint\"] = {\n",
+    "  \"Machine\":\"daint\",\n",
+    "  \"Cores\": 128,\n",
+    "  \"Threads per core\": 2,\n",
+    "  \"Allowed rpns\": [1],\n",
+    "  \"Thread_array\": [1,2,4,8,16],\n",
+    "  \"Sleeptime\":0.25,\n",
+    "  \"Launch\": \"sbatch --chdir={job_path} {job_file}\",\n",
+    "  \"Run command\": \"srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} \",\n",
+    "  \"Batch preamble\": \"\"\"\n",
+    "#!/bin/bash -l\n",
+    "#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}\n",
+    "#SBATCH --time={time_min}\n",
+    "#SBATCH --nodes={nodes}\n",
+    "#SBATCH --partition=normal\n",
+    "#SBATCH --account=csstaff\n",
+    "#SBATCH --constraint=mc\n",
+    "#SBATCH --output=output.txt\n",
+    "#SBATCH --error=error.txt\n",
+    "\n",
+    "module swap craype/2.7.10 craype/2.7.15\n",
+    "\n",
+    "# alternatives : srun --cpu-bind v,mask_cpu:0xffff\n",
+    "# export GOMP_CPU_AFFINITY=0-{threadsm1}\n",
+    "\n",
+    "# Old Env vars that might be useful\n",
+    "# export MPICH_MAX_THREAD_SAFETY=multiple\n",
+    "# export OMP_NUM_THREADS={threads}\n",
+    "# export MKL_NUM_THREADS={threads}\n",
+    "# export MPICH_GNI_NDREG_ENTRIES=1024\n",
+    "\n",
+    "# Debug\n",
+    "module list &> modules.txt\n",
+    "printenv > env.txt\n",
+    "\n",
+    "# Commands\n",
+    "\"\"\"\n",
+    "}\n",
+    "\n",
+    "# daint mc nodes config\n",
+    "cscs[\"lumi\"] = {\n",
+    "  \"Machine\":\"lumi\",\n",
+    "  \"Cores\": 16,\n",
+    "  \"Threads per core\": 2,\n",
+    "  \"Allowed rpns\": [1],\n",
+    "  \"Thread_array\": [1,2,4,8,16],\n",
+    "  \"Sleeptime\":0.25,\n",
+    "  \"Launch\": \"sbatch --chdir={job_path} {job_file}\",\n",
+    "  \"Run command\": \"srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} \",\n",
+    "  \"Batch preamble\": \"\"\"\n",
+    "#!/bin/bash -l\n",
+    "#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}\n",
+    "#SBATCH --time={time_min}\n",
+    "#SBATCH --nodes={nodes}\n",
+    "#SBATCH --partition=standard\n",
+    "#SBATCH --account=project_465000105\n",
+    "#SBATCH --output=output.txt\n",
+    "#SBATCH --error=error.txt\n",
+    "\n",
+    "module load LUMI/22.06\n",
+    "module load cpeGNU\n",
+    "module load buildtools\n",
+    "module load Boost\n",
+    "\n",
+    "# alternatives : srun --cpu-bind v,mask_cpu:0xffff\n",
+    "# export GOMP_CPU_AFFINITY=0-{threadsm1}\n",
+    "\n",
+    "export MPICH_MAX_THREAD_SAFETY=multiple\n",
+    "# export OMP_NUM_THREADS={threads}\n",
+    "# export MKL_NUM_THREADS={threads}\n",
+    "# export MPICH_GNI_NDREG_ENTRIES=1024\n",
+    "\n",
+    "# Debug\n",
+    "module list &> modules.txt\n",
+    "printenv > env.txt\n",
+    "\n",
+    "# Commands\n",
+    "\"\"\"\n",
+    "}\n",
+    "\n",
+    "\n",
+    "cscs['eiger'] = cscs['daint']\n",
+    "cscs['eiger']['Machine'] = 'eiger'\n",
+    "cscs['eiger']['Cores'] = 64\n",
+    "cscs['eiger']['Thread_array'] = [1,2,4,8,16]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# Generate Job script preamble\n",
+    "#\n",
+    "def init_job_text(system, run_name, time_min, transport, nodes, threads, inflight, size):\n",
+    "    return system[\"Batch preamble\"].format(run_name=run_name,\n",
+    "                                           time_min=time_min,\n",
+    "                                           transport=transport,\n",
+    "                                           nodes=nodes,\n",
+    "                                           threads=threads,\n",
+    "                                           threadsm1=(threads-1),\n",
+    "                                           inflight=inflight,\n",
+    "                                           size=size).strip()\n",
+    "#\n",
+    "# create a directory name from params\n",
+    "#\n",
+    "def make_job_directory(fdir,name, transport, nodes, threads, inflight, size):\n",
+    "    return f'{fdir}/{name}_{transport}_{nodes}_{threads}_{inflight}_{size}'\n",
+    "\n",
+    "#\n",
+    "# create the launch command-line\n",
+    "#\n",
+    "def run_command(system, total_ranks, cpus_per_rank, timeout):\n",
+    "    return system[\"Run command\"].format(total_ranks=total_ranks, cpus_per_rank=cpus_per_rank, threads_per_rank=cpus_per_rank, timeout=timeout)\n",
+    "\n",
+    "#\n",
+    "# create dir + write final script for sbatch/shell or other job launcher\n",
+    "#\n",
+    "def write_job_file(system, launch_file, job_dir, job_text, suffix=''):\n",
+    "    job_path = os.path.expanduser(job_dir)\n",
+    "    os.makedirs(job_path, exist_ok=True)\n",
+    "    job_file = f\"{job_path}/job_{suffix}.sh\"\n",
+    "    print(f\"Generating : {job_path} : {job_file}\")\n",
+    "\n",
+    "    with open(job_file, \"w\") as f:\n",
+    "        f.write(job_text)\n",
+    "        make_executable(job_file)\n",
+    "\n",
+    "    launchstring  = system[\"Launch\"].format(job_path=job_path,job_file=job_file) + '\\n'\n",
+    "    launchstring += 'sleep ' + str(system['Sleeptime']) + '\\n'\n",
+    "    launch_file.write(launchstring)\n",
+    "\n",
+    "#\n",
+    "# generate a string that decorates and launches a single instance of the test\n",
+    "#\n",
+    "def execution_string(env, launch_cmd, prog_cmd, output_redirect):\n",
+    "    full_command = f\"{env} {launch_cmd} {prog_cmd}\".strip()\n",
+    "    command_prologue  = f'printf \"\\\\n'\n",
+    "    command_prologue += f'# ----- Executing \\\\n'\n",
+    "    command_prologue += f'{full_command}    \\\\n'\n",
+    "    command_prologue += f'# --------------- \\\\n\" >> {output_redirect}'\n",
+    "    command_epilogue  = f'printf \"\\\\n'\n",
+    "    command_epilogue += f'# ----- Finished  \\\\n\\\\n\" >> {output_redirect}'\n",
+    "    return '\\n' + command_prologue + '\\n' + full_command + ' >> ' + output_redirect + '\\n' + command_epilogue + '\\n'\n",
+    "\n",
+    "#\n",
+    "# generate application specific commmands/flags/options that go into the job script\n",
+    "#\n",
+    "def oomph_original(system, bin_dir, timeout, transport, progs, nodes, threads, msg, size, inflight, env):\n",
+    "    total_ranks = 2\n",
+    "    whole_cmd = ''\n",
+    "    suffix = ''\n",
+    "\n",
+    "    # transport layers use '_libfabric', '_ucx', '_mpi', etc\n",
+    "    if args.type!='native':\n",
+    "        suffix = f'_{transport}'\n",
+    "\n",
+    "    # timed version uses seconds instead of messages/iterations\n",
+    "    if args.type=='timed':\n",
+    "        msg = 30\n",
+    "\n",
+    "    # always remember to add a space to the end of each env var for concatenation of many of them\n",
+    "    if threads==1:\n",
+    "        env +=  'MPICH_MAX_THREAD_SAFETY=single '\n",
+    "    else:\n",
+    "        env +=  'MPICH_MAX_THREAD_SAFETY=multiple '\n",
+    "        env += f'OMP_NUM_THREADS={threads} '\n",
+    "        env += f'GOMP_CPU_AFFINITY=0-{threads} '\n",
+    "\n",
+    "    for prog in progs:\n",
+    "        if threads>1:\n",
+    "            if args.type=='normal' or args.type=='timed':\n",
+    "                prog = prog + '_mt'\n",
+    "\n",
+    "        if transport=='native' and threads==1:\n",
+    "            prog = prog.replace('_mt_','_')\n",
+    "\n",
+    "        # generate the name of the output file we redirect output to\n",
+    "        outfile = f'{prog}_N{nodes}_T{threads}_I{msg}_S{size}_F{inflight}.out'\n",
+    "\n",
+    "        # generate the program commmand with all command line params needed by program\n",
+    "        prog_cmd = f\"{bin_dir}/{prog}{suffix} {msg} {size} {inflight}\"\n",
+    "\n",
+    "        # get the system launch command (mpiexec, srun, etc) with options/params\n",
+    "        launch_cmd = run_command(system, total_ranks, threads, timeout)\n",
+    "\n",
+    "        if transport=='libfabric':\n",
+    "            env2 = env + 'LIBFABRIC_POLL_SIZE=32 '\n",
+    "            #for ep in ['single', 'multiple', 'scalableTx', 'threadlocal']:\n",
+    "            for ep in ['threadlocal']:\n",
+    "                whole_cmd += execution_string(env2 + f\"LIBFABRIC_ENDPOINT_TYPE={ep} \", launch_cmd, prog_cmd, outfile)\n",
+    "            if False: # add option to enable this?\n",
+    "                whole_cmd += execution_string(env2 + f\"LIBFABRIC_ENDPOINT_TYPE={ep} \" + f\"LIBFABRIC_AUTO_PROGRESS=1 \", launch_cmd, prog_cmd, outfile)\n",
+    "        else:\n",
+    "            whole_cmd += execution_string(env, launch_cmd, prog_cmd, outfile)\n",
+    "\n",
+    "    return whole_cmd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "system = cscs[hostname]\n",
+    "#\n",
+    "job_name       = 'oomph'\n",
+    "timeout        = args.timeout\n",
+    "time_min       = 2000*60 # total time estimate\n",
+    "timestr        = time.strftime('%H:%M:%S', time.gmtime(time_min))\n",
+    "ranks_per_node = 1\n",
+    "nodes_arr = [2]\n",
+    "thrd_arr  = system['Thread_array']\n",
+    "size_arr  = [1,10,100,1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000]\n",
+    "nmsg_lut  = {1:500000,\n",
+    "             10:500000,\n",
+    "             100:500000,\n",
+    "             1000:500000,\n",
+    "             2000:500000,\n",
+    "             5000:250000,\n",
+    "             10000:250000,\n",
+    "             20000:250000,\n",
+    "             50000:250000,\n",
+    "             100000:250000,\n",
+    "             200000:250000,\n",
+    "             500000:100000,\n",
+    "             1000000:50000,\n",
+    "             2000000:25000}\n",
+    "\n",
+    "flight_arr = [1,10,50,100]\n",
+    "\n",
+    "if args.type=='normal':\n",
+    "    trans_arr = ['libfabric', 'mpi']\n",
+    "    prog_arr  = [\n",
+    "        #\"bench_p2p_bi_cb_avail\",\n",
+    "        #\"bench_p2p_bi_cb_wait\",\n",
+    "        \"bench_p2p_bi_ft_avail\",\n",
+    "        #\"bench_p2p_bi_ft_wait\"\n",
+    "    ]\n",
+    "\n",
+    "if args.type=='timed':\n",
+    "    trans_arr = ['libfabric', 'mpi']\n",
+    "    prog_arr  = ['bench_p2p_pp_ft_avail']\n",
+    "\n",
+    "if args.type=='native':\n",
+    "    trans_arr = ['native']\n",
+    "    prog_arr  = [\n",
+    "        #\"mpi_p2p_bi_avail_mt_test\", \"mpi_p2p_bi_avail_mt_testany\",\n",
+    "        #\"mpi_p2p_bi_wait_mt_wait\",\n",
+    "        \"mpi_p2p_bi_wait_mt_waitall\"\n",
+    "    ]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Uncommment the following line to perform the job creation\n",
+      "Generating : /home/biddisco/benchmarking-results/test/oomph_all_2_1_1_1 : /home/biddisco/benchmarking-results/test/oomph_all_2_1_1_1/job_.sh\n",
+      "Combinations 168 est-time 1344 minutes\n"
+     ]
+    }
+   ],
+   "source": [
+    "combos = 0\n",
+    "\n",
+    "if run_dir.startswith('@'):\n",
+    "    print(f'Skipping creation of job launch file for {run_dir}')\n",
+    "else:\n",
+    "    job_launch = f\"{run_dir}/launch.sh\"\n",
+    "    job_launch_file = open(job_launch, \"w\")\n",
+    "    #\n",
+    "    job_launch_file.write(\"#!/bin/bash -l\\n\")\n",
+    "\n",
+    "# create the output directory for each job\n",
+    "job_dir = make_job_directory(run_dir, 'oomph', \"all\", 2, 1, 1, 1)\n",
+    "\n",
+    "# first part of boiler plate job script\n",
+    "job_text = init_job_text(system, job_name, timestr, \"all\", 2, 16, 1, 1)\n",
+    "\n",
+    "# generate all combinations in one monster loop\n",
+    "for nodes, transport, threads, size, inflight in product(nodes_arr, trans_arr, thrd_arr, size_arr, flight_arr):\n",
+    "\n",
+    "    env = \"\"\n",
+    "    msg = nmsg_lut[size]\n",
+    "\n",
+    "    # create the output directory for each job\n",
+    "    #job_dir = make_job_directory(run_dir, 'oomph', transport, nodes, threads, inflight, size)\n",
+    "\n",
+    "    # first part of boiler plate job script\n",
+    "    #job_text = init_job_text(system, job_name, timestr, transport, nodes, threads, inflight, size)\n",
+    "\n",
+    "    env = 'MPICH_GNI_NDREG_ENTRIES=1024 '\n",
+    "\n",
+    "    # application specific part of job script\n",
+    "    job_text += oomph_original(\n",
+    "        system,\n",
+    "        binary_dir,\n",
+    "        timeout,\n",
+    "        transport,\n",
+    "        prog_arr,\n",
+    "        nodes,\n",
+    "        threads,\n",
+    "        msg,\n",
+    "        size,\n",
+    "        inflight,\n",
+    "        env\n",
+    "    )\n",
+    "    # debugging\n",
+    "    # print(job_dir, '\\n', job_text, '\\n\\n\\n\\n')\n",
+    "\n",
+    "    combos += 1\n",
+    "\n",
+    "    if combos==1:\n",
+    "        print('Uncommment the following line to perform the job creation')\n",
+    "\n",
+    "write_job_file(system, job_launch_file, job_dir, job_text)\n",
+    "\n",
+    "make_executable(job_launch)\n",
+    "print('Combinations', combos, 'est-time', combos*4*2,'minutes')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/benchmarks/scripts/generate-oomph.py b/benchmarks/scripts/generate-oomph.py
new file mode 100644
index 00000000..1c43715d
--- /dev/null
+++ b/benchmarks/scripts/generate-oomph.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[1]:
+
+
+from itertools import product
+import math
+import numpy as np
+import inspect
+import os
+import time
+from IPython.display import Image, display, HTML
+import importlib
+import socket
+import argparse
+
+# working dir
+cwd = os.getcwd()
+
+# name of this script
+scriptname = inspect.getframeinfo(inspect.currentframe()).filename
+scriptpath = os.path.dirname(os.path.abspath(scriptname))
+
+
+# In[2]:
+
+
+def is_notebook():
+    try:
+        shell = get_ipython().__class__.__name__
+        if shell == 'ZMQInteractiveShell':
+            return True   # Jupyter notebook or qtconsole
+        elif shell == 'TerminalInteractiveShell':
+            return False  # Terminal running IPython
+        else:
+            return False  # Other type (?)
+    except NameError:
+        return False      # Probably standard Python interpreter
+
+if is_notebook():
+    # this makes the notebook wider on a larger screen using %x of the display
+    display(HTML("<style>.container { width:100% !important; }</style>"))
+    # save this notebook as a raw python file as well please
+    get_ipython().system('jupyter nbconvert --to script generate-oomph.ipynb')
+
+
+# In[3]:
+
+
+# ------------------------------------------------------------------
+# Command line params
+# ------------------------------------------------------------------
+def get_command_line_args(notebook_args=None):
+    parser = argparse.ArgumentParser(description='Generator for oomph benchmarks')
+    parser.add_argument('-d', '--dir', default=cwd, action='store', help='base directory to generate job scripts in')
+    parser.add_argument('-t', '--type', default='normal', action='store', help='normal, timed or native for different test types')
+    parser.add_argument('-T', '--timeout', default=120, action='store', help='executable timeout period')
+    parser.add_argument('-m', '--machine', default='', action='store', help='select machine batch job config/preamble')
+    if is_notebook():
+        parser.add_argument('-f', help='seems to be defaulted by jupyter')
+        return parser.parse_args(notebook_args)
+    return parser.parse_args()
+
+notebook_args = '--type=native --dir /home/biddisco/benchmarking-results/test'.split()
+if is_notebook():
+    args = get_command_line_args(notebook_args)
+else:
+    args = get_command_line_args()
+
+
+# In[4]:
+
+
+# hostname + cleanup login node 'daint101' etc
+if args.machine != '':
+    hostname = args.machine
+elif os.environ.get('LUMI_STACK_NAME', 'oryx2') == 'LUMI':
+    hostname = 'lumi'
+elif socket.gethostname().startswith('daint'):
+    hostname = 'daint'
+else :
+    hostname = 'oryx2'
+
+# summary
+print(f'CWD        : {cwd} \nScriptpath : {scriptpath} \nHostname   : {hostname}')
+
+
+# In[5]:
+
+
+def make_executable(path):
+    mode = os.stat(path).st_mode
+    mode |= (mode & 0o444) >> 2    # copy R bits to X
+    os.chmod(path, mode)
+
+
+# In[6]:
+
+
+# strings with @xxx@ will be substituted by cmake
+binary_dir = "@BIN_DIR@"
+
+if args.dir:
+    run_dir = args.dir
+else:
+    run_dir = "@RUN_DIR@"
+
+print(f'Generating scripts in {run_dir}')
+
+
+# In[7]:
+
+
+cscs = {}
+
+# jb laptop
+cscs["oryx2"] = {
+  "Machine":"system76",
+  "Cores": 8,
+  "Threads per core": 2,
+  "Allowed rpns": [1, 2],
+  "Thread_array": [1,2,4],
+  "Sleeptime":0,
+  "Launch": "pushd {job_path} && source {job_file} && popd",
+  "Run command": "mpiexec -n {total_ranks} --oversubscribe timeout {timeout} ",
+  "Batch preamble": """
+#!/bin/bash -l
+
+# Env
+#export OMP_NUM_THREADS={threads}
+#export GOMP_CPU_AFFINITY=0-{threadsm1}
+
+# Commands
+"""
+}
+
+# daint mc nodes config
+cscs["daint"] = {
+  "Machine":"daint",
+  "Cores": 128,
+  "Threads per core": 2,
+  "Allowed rpns": [1],
+  "Thread_array": [1,2,4,8,16],
+  "Sleeptime":0.25,
+  "Launch": "sbatch --chdir={job_path} {job_file}",
+  "Run command": "srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} ",
+  "Batch preamble": """
+#!/bin/bash -l
+#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}
+#SBATCH --time={time_min}
+#SBATCH --nodes={nodes}
+#SBATCH --partition=normal
+#SBATCH --account=csstaff
+#SBATCH --constraint=mc
+#SBATCH --output=output.txt
+#SBATCH --error=error.txt
+
+module swap craype/2.7.10 craype/2.7.15
+
+# alternatives : srun --cpu-bind v,mask_cpu:0xffff
+# export GOMP_CPU_AFFINITY=0-{threadsm1}
+
+# Old Env vars that might be useful
+# export MPICH_MAX_THREAD_SAFETY=multiple
+# export OMP_NUM_THREADS={threads}
+# export MKL_NUM_THREADS={threads}
+# export MPICH_GNI_NDREG_ENTRIES=1024
+
+# Debug
+module list &> modules.txt
+printenv > env.txt
+
+# Commands
+"""
+}
+
+# daint mc nodes config
+cscs["lumi"] = {
+  "Machine":"lumi",
+  "Cores": 16,
+  "Threads per core": 2,
+  "Allowed rpns": [1],
+  "Thread_array": [1,2,4,8,16],
+  "Sleeptime":0.25,
+  "Launch": "sbatch --chdir={job_path} {job_file}",
+  "Run command": "srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} ",
+  "Batch preamble": """
+#!/bin/bash -l
+#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}
+#SBATCH --time={time_min}
+#SBATCH --nodes={nodes}
+#SBATCH --partition=standard
+#SBATCH --account=project_465000105
+#SBATCH --output=output.txt
+#SBATCH --error=error.txt
+
+module load LUMI/22.06
+module load cpeGNU
+module load buildtools
+module load Boost
+
+# alternatives : srun --cpu-bind v,mask_cpu:0xffff
+# export GOMP_CPU_AFFINITY=0-{threadsm1}
+
+export MPICH_MAX_THREAD_SAFETY=multiple
+# export OMP_NUM_THREADS={threads}
+# export MKL_NUM_THREADS={threads}
+# export MPICH_GNI_NDREG_ENTRIES=1024
+
+# Debug
+module list &> modules.txt
+printenv > env.txt
+
+# Commands
+"""
+}
+
+
+cscs['eiger'] = cscs['daint']
+cscs['eiger']['Machine'] = 'eiger'
+cscs['eiger']['Cores'] = 64
+cscs['eiger']['Thread_array'] = [1,2,4,8,16]
+
+
+# In[8]:
+
+
+#
+# Generate Job script preamble
+#
+def init_job_text(system, run_name, time_min, transport, nodes, threads, inflight, size):
+    return system["Batch preamble"].format(run_name=run_name,
+                                           time_min=time_min,
+                                           transport=transport,
+                                           nodes=nodes,
+                                           threads=threads,
+                                           threadsm1=(threads-1),
+                                           inflight=inflight,
+                                           size=size).strip()
+#
+# create a directory name from params
+#
+def make_job_directory(fdir,name, transport, nodes, threads, inflight, size):
+    return f'{fdir}/{name}_{transport}_{nodes}_{threads}_{inflight}_{size}'
+
+#
+# create the launch command-line
+#
+def run_command(system, total_ranks, cpus_per_rank, timeout):
+    return system["Run command"].format(total_ranks=total_ranks, cpus_per_rank=cpus_per_rank, threads_per_rank=cpus_per_rank, timeout=timeout)
+
+#
+# create dir + write final script for sbatch/shell or other job launcher
+#
+def write_job_file(system, launch_file, job_dir, job_text, suffix=''):
+    job_path = os.path.expanduser(job_dir)
+    os.makedirs(job_path, exist_ok=True)
+    job_file = f"{job_path}/job_{suffix}.sh"
+    print(f"Generating : {job_path} : {job_file}")
+
+    with open(job_file, "w") as f:
+        f.write(job_text)
+        make_executable(job_file)
+
+    launchstring  = system["Launch"].format(job_path=job_path,job_file=job_file) + '\n'
+    launchstring += 'sleep ' + str(system['Sleeptime']) + '\n'
+    launch_file.write(launchstring)
+
+#
+# generate a string that decorates and launches a single instance of the test
+#
+def execution_string(env, launch_cmd, prog_cmd, output_redirect):
+    full_command = f"{env} {launch_cmd} {prog_cmd}".strip()
+    command_prologue  = f'printf "\\n'
+    command_prologue += f'# ----- Executing \\n'
+    command_prologue += f'{full_command}    \\n'
+    command_prologue += f'# --------------- \\n" >> {output_redirect}'
+    command_epilogue  = f'printf "\\n'
+    command_epilogue += f'# ----- Finished  \\n\\n" >> {output_redirect}'
+    return '\n' + command_prologue + '\n' + full_command + ' >> ' + output_redirect + '\n' + command_epilogue + '\n'
+
+#
+# generate application specific commmands/flags/options that go into the job script
+#
+def oomph_original(system, bin_dir, timeout, transport, progs, nodes, threads, msg, size, inflight, env):
+    total_ranks = 2
+    whole_cmd = ''
+    suffix = ''
+
+    # transport layers use '_libfabric', '_ucx', '_mpi', etc
+    if args.type!='native':
+        suffix = f'_{transport}'
+
+    # timed version uses seconds instead of messages/iterations
+    if args.type=='timed':
+        msg = 30
+
+    # always remember to add a space to the end of each env var for concatenation of many of them
+    if threads==1:
+        env +=  'MPICH_MAX_THREAD_SAFETY=single '
+    else:
+        env +=  'MPICH_MAX_THREAD_SAFETY=multiple '
+        env += f'OMP_NUM_THREADS={threads} '
+        env += f'GOMP_CPU_AFFINITY=0-{threads} '
+
+    for prog in progs:
+        if threads>1:
+            if args.type=='normal' or args.type=='timed':
+                prog = prog + '_mt'
+
+        if transport=='native' and threads==1:
+            prog = prog.replace('_mt_','_')
+
+        # generate the name of the output file we redirect output to
+        outfile = f'{prog}_N{nodes}_T{threads}_I{msg}_S{size}_F{inflight}.out'
+
+        # generate the program commmand with all command line params needed by program
+        prog_cmd = f"{bin_dir}/{prog}{suffix} {msg} {size} {inflight}"
+
+        # get the system launch command (mpiexec, srun, etc) with options/params
+        launch_cmd = run_command(system, total_ranks, threads, timeout)
+
+        if transport=='libfabric':
+            env2 = env + 'LIBFABRIC_POLL_SIZE=32 '
+            #for ep in ['single', 'multiple', 'scalableTx', 'threadlocal']:
+            for ep in ['threadlocal']:
+                whole_cmd += execution_string(env2 + f"LIBFABRIC_ENDPOINT_TYPE={ep} ", launch_cmd, prog_cmd, outfile)
+            if False: # add option to enable this?
+                whole_cmd += execution_string(env2 + f"LIBFABRIC_ENDPOINT_TYPE={ep} " + f"LIBFABRIC_AUTO_PROGRESS=1 ", launch_cmd, prog_cmd, outfile)
+        else:
+            whole_cmd += execution_string(env, launch_cmd, prog_cmd, outfile)
+
+    return whole_cmd
+
+
+# In[9]:
+
+
+system = cscs[hostname]
+#
+job_name       = 'oomph'
+timeout        = args.timeout
+time_min       = 2000*60 # total time estimate
+timestr        = time.strftime('%H:%M:%S', time.gmtime(time_min))
+ranks_per_node = 1
+nodes_arr = [2]
+thrd_arr  = system['Thread_array']
+size_arr  = [1,10,100,1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000]
+nmsg_lut  = {1:500000,
+             10:500000,
+             100:500000,
+             1000:500000,
+             2000:500000,
+             5000:250000,
+             10000:250000,
+             20000:250000,
+             50000:250000,
+             100000:250000,
+             200000:250000,
+             500000:100000,
+             1000000:50000,
+             2000000:25000}
+
+flight_arr = [1,10,50,100]
+
+if args.type=='normal':
+    trans_arr = ['libfabric', 'mpi']
+    prog_arr  = [
+        #"bench_p2p_bi_cb_avail",
+        #"bench_p2p_bi_cb_wait",
+        "bench_p2p_bi_ft_avail",
+        #"bench_p2p_bi_ft_wait"
+    ]
+
+if args.type=='timed':
+    trans_arr = ['libfabric', 'mpi']
+    prog_arr  = ['bench_p2p_pp_ft_avail']
+
+if args.type=='native':
+    trans_arr = ['native']
+    prog_arr  = [
+        #"mpi_p2p_bi_avail_mt_test", "mpi_p2p_bi_avail_mt_testany",
+        #"mpi_p2p_bi_wait_mt_wait",
+        "mpi_p2p_bi_wait_mt_waitall"
+    ]
+
+
+# In[10]:
+
+
+combos = 0
+
+if run_dir.startswith('@'):
+    print(f'Skipping creation of job launch file for {run_dir}')
+else:
+    job_launch = f"{run_dir}/launch.sh"
+    job_launch_file = open(job_launch, "w")
+    #
+    job_launch_file.write("#!/bin/bash -l\n")
+
+# create the output directory for each job
+job_dir = make_job_directory(run_dir, 'oomph', "all", 2, 1, 1, 1)
+
+# first part of boiler plate job script
+job_text = init_job_text(system, job_name, timestr, "all", 2, 16, 1, 1)
+
+# generate all combinations in one monster loop
+for nodes, transport, threads, size, inflight in product(nodes_arr, trans_arr, thrd_arr, size_arr, flight_arr):
+
+    env = ""
+    msg = nmsg_lut[size]
+
+    # create the output directory for each job
+    #job_dir = make_job_directory(run_dir, 'oomph', transport, nodes, threads, inflight, size)
+
+    # first part of boiler plate job script
+    #job_text = init_job_text(system, job_name, timestr, transport, nodes, threads, inflight, size)
+
+    env = 'MPICH_GNI_NDREG_ENTRIES=1024 '
+
+    # application specific part of job script
+    job_text += oomph_original(
+        system,
+        binary_dir,
+        timeout,
+        transport,
+        prog_arr,
+        nodes,
+        threads,
+        msg,
+        size,
+        inflight,
+        env
+    )
+    # debugging
+    # print(job_dir, '\n', job_text, '\n\n\n\n')
+
+    combos += 1
+
+    if combos==1:
+        print('Uncommment the following line to perform the job creation')
+
+write_job_file(system, job_launch_file, job_dir, job_text)
+
+make_executable(job_launch)
+print('Combinations', combos, 'est-time', combos*4*2,'minutes')
+
+
+# In[ ]:
+
+
+
+
diff --git a/cmake/oomph_external_dependencies.cmake b/cmake/oomph_external_dependencies.cmake
index 92de39bd..e0eb1a65 100644
--- a/cmake/oomph_external_dependencies.cmake
+++ b/cmake/oomph_external_dependencies.cmake
@@ -1,8 +1,9 @@
 include(oomph_git_submodule)
 include(oomph_external_project)
+include(ExternalProject)
 
 if(OOMPH_GIT_SUBMODULE)
-    update_git_submodules()
+#    update_git_submodules()
 endif()
 
 # ---------------------------------------------------------------------
@@ -15,48 +16,40 @@ find_package(MPI REQUIRED COMPONENTS CXX)
 # ---------------------------------------------------------------------
 find_package(Boost REQUIRED)
 
-# ---------------------------------------------------------------------
-# hwmalloc setup
-# ---------------------------------------------------------------------
-cmake_dependent_option(OOMPH_USE_BUNDLED_HWMALLOC "Use bundled hwmalloc lib." ON
-    "OOMPH_USE_BUNDLED_LIBS" OFF)
-if(OOMPH_USE_BUNDLED_HWMALLOC)
-    check_git_submodule(hwmalloc ext/hwmalloc)
-    add_subdirectory(ext/hwmalloc)
-    add_library(HWMALLOC::hwmalloc ALIAS hwmalloc)
-else()
-    find_package(HWMALLOC REQUIRED)
-endif()
+#------------------------------------------------------------------------------
+# Find Threads
+#------------------------------------------------------------------------------
+find_package(Threads REQUIRED)
+
+# ------------------------------------------------------------------------------
+# Build/Download HWMalloc
+# ------------------------------------------------------------------------------
+get_external_project(
+  PROJECT_NAME
+  "hwmalloc"
+  FOLDER_NAME
+  "hwmalloc"
+  GIT_REPO
+  "https://github.com/ghex-org/hwmalloc.git"
+  GIT_TAG
+  "master"
+)
+
+add_library(HWMALLOC::hwmalloc ALIAS hwmalloc)
 
 # ---------------------------------------------------------------------
 # google test setup
 # ---------------------------------------------------------------------
-cmake_dependent_option(OOMPH_USE_BUNDLED_GTEST "Use bundled googletest lib." ON
-    "OOMPH_USE_BUNDLED_LIBS" OFF)
-if (OOMPH_WITH_TESTING)
-    if(OOMPH_USE_BUNDLED_GTEST)
-        add_external_cmake_project(
-            NAME googletest
-            PATH ext/googletest
-            INTERFACE_NAME ext-gtest
-            LIBS libgtest.a libgtest_main.a
-            CMAKE_ARGS
-                "-DCMAKE_BUILD_TYPE=release"
-                "-DBUILD_SHARED_LIBS=OFF"
-                "-DBUILD_GMOCK=OFF")
-        # on some systems we need link explicitly against threads
-        if (TARGET ext-gtest)
-            find_package (Threads)
-            target_link_libraries(ext-gtest INTERFACE Threads::Threads)
-        endif()
-    else()
-        # Use system provided google test
-        find_package(GTest REQUIRED)
-        add_library(ext-gtest INTERFACE)
-        if (${CMAKE_VERSION} VERSION_LESS "3.20.0")
-            target_link_libraries(ext-gtest INTERFACE GTest::GTest GTest::Main)
-        else()
-            target_link_libraries(ext-gtest INTERFACE GTest::gtest GTest::gtest_main)
-        endif()
-    endif()
+find_package(GTest QUIET)
+message("GTest FOUND ${GTest_FOUND}")
+if (NOT GTest_FOUND)
+    include(FetchContent)
+    FetchContent_Declare(
+      googletest
+      GIT_REPOSITORY https://github.com/google/googletest.git
+      GIT_TAG main
+      GIT_SHALLOW TRUE)
+    # For Windows: Prevent overriding the parent project's compiler/linker settings
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+    FetchContent_MakeAvailable(googletest)
 endif()
diff --git a/cmake/oomph_libfabric.cmake b/cmake/oomph_libfabric.cmake
index 5a981bc1..c5209e99 100644
--- a/cmake/oomph_libfabric.cmake
+++ b/cmake/oomph_libfabric.cmake
@@ -76,7 +76,7 @@ if (OOMPH_WITH_LIBFABRIC)
         file(WRITE ${TEMP_FILENAME}
             ${PREAMBLE}
             ${oomph_config_defines}
-            "\n#endif\n"
+            "#endif\n"
         )
         configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY)
         file(REMOVE "${TEMP_FILENAME}")
@@ -93,8 +93,9 @@ if (OOMPH_WITH_LIBFABRIC)
     # Hardware device selection
     #------------------------------------------------------------------------------
     set(OOMPH_LIBFABRIC_PROVIDER "tcp" CACHE
-        STRING "The provider (cxi/gni/psm2/sockets/tcp/verbs)")
-    set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS "cxi" "gni" "psm2" "sockets" "tcp" "verbs")
+        STRING "The provider (cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband))")
+    set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS
+        "cxi" "efa" "gni" "psm2" "tcp" "verbs")
 
     oomph_libfabric_add_config_define_namespace(
         DEFINE HAVE_LIBFABRIC_PROVIDER
@@ -115,11 +116,17 @@ if (OOMPH_WITH_LIBFABRIC)
         oomph_libfabric_add_config_define_namespace(
             DEFINE HAVE_LIBFABRIC_CXI
             NAMESPACE libfabric)
+    elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "efa")
+        oomph_libfabric_add_config_define_namespace(
+            DEFINE HAVE_LIBFABRIC_EFA
+            NAMESPACE libfabric)
     elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "tcp")
         oomph_libfabric_add_config_define_namespace(
             DEFINE HAVE_LIBFABRIC_TCP
             NAMESPACE libfabric)
     elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "sockets")
+        message(WARNING "The Sockets provider is deprecated in favor of the tcp, udp, "
+            "and utility providers")
         oomph_libfabric_add_config_define_namespace(
             DEFINE HAVE_LIBFABRIC_SOCKETS
             NAMESPACE libfabric)
@@ -134,7 +141,6 @@ if (OOMPH_WITH_LIBFABRIC)
     #------------------------------------------------------------------------------
     set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS OFF BOOL
         STRING "Enable libfabric parcelport performance counters (default: OFF)")
-    set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS "tcp" "sockets" "psm2" "verbs" "gni")
     mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
 
     if (OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS)
@@ -143,14 +149,20 @@ if (OOMPH_WITH_LIBFABRIC)
           NAMESPACE libfabric)
     endif()
 
+    #------------------------------------------------------------------------------
+    # used by template expansion for location of print.hpp
+    #------------------------------------------------------------------------------
+    set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric")
+
     #------------------------------------------------------------------------------
     # Write options to file in build dir
     #------------------------------------------------------------------------------
     oomph_libfabric_write_config_defines_file(
         NAMESPACE libfabric
-        FILENAME  "${PROJECT_BINARY_DIR}/oomph_libfabric_defines.hpp"
+        FILENAME  "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp"
+        TEMPLATE  "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp"
     )
-    target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}")
+    target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric")
 endif()
 
 
diff --git a/ext/googletest b/ext/googletest
deleted file mode 160000
index b10fad38..00000000
--- a/ext/googletest
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b10fad38c4026a29ea6561ab15fc4818170d1c10
diff --git a/include/oomph/context.hpp b/include/oomph/context.hpp
index f1519ef6..bd12588b 100644
--- a/include/oomph/context.hpp
+++ b/include/oomph/context.hpp
@@ -120,7 +120,7 @@ template<typename Context>
 typename Context::region_type register_memory(Context&, void*, std::size_t);
 #if OOMPH_ENABLE_DEVICE
 template<typename Context>
-typename Context::device_region_type register_device_memory(Context&, void*, std::size_t);
+typename Context::device_region_type register_device_memory(Context&, int, void*, std::size_t);
 #endif
 
 } // namespace oomph
diff --git a/spack/packages/oomph/package.py b/spack/packages/oomph/package.py
new file mode 100644
index 00000000..a0281b7a
--- /dev/null
+++ b/spack/packages/oomph/package.py
@@ -0,0 +1,96 @@
+from spack import *
+import os
+
+class Oomph(CMakePackage, CudaPackage, ROCmPackage):
+    """dummy placeholder for oomph dependencies"""
+    homepage    = "https://127.0.0.1/readme.html"
+    generator   = "Ninja"
+    maintainers = ["biddisco"]
+    version("develop")
+
+    # we have only tested/supported a subset of potential libfabrics providers
+    fabrics = (
+        "cxi", "efa", "gni", "psm2", "tcp", "verbs",
+    )
+
+    variant(
+        "ofi",
+        default="tcp",
+        description="A list of enabled OFI fabrics",
+        values=fabrics,
+        multi=False,
+    )
+
+    # ------------------------------------------------------------------------
+    # Exactly one of +cuda and +rocm need to be set
+    # ------------------------------------------------------------------------
+    conflicts("+cuda +rocm")
+
+    # ------------------------------------------------------------------------
+    # variants
+    # ------------------------------------------------------------------------
+    variant("ucx", default=False, description="Enable ucx support")
+    variant("ofi", default=False, description="Enable ofi libfabric support")
+    variant("testing", default=False, description="Enable testing")
+
+    # ------------------------------------------------------------------------
+    # build time dependencies
+    # ------------------------------------------------------------------------
+    depends_on("ninja", type="build")
+    depends_on("cmake@3.22:", type="build")
+
+    # ------------------------------------------------------------------------
+    # generic c++ libs needed by several projects
+    # ------------------------------------------------------------------------
+    depends_on("boost +atomic+chrono+container+context+coroutine+date_time+filesystem+program_options+regex+serialization+system+test+thread+mpi+graph+json cxxstd=17")
+    #depends_on("fmt")
+
+    # ------------------------------------------------------------------------
+    # GPU/cuda/rocm
+    # ------------------------------------------------------------------------
+    depends_on("cuda",       when="+cuda")
+    depends_on("rocm-core",  when="+rocm")
+
+    # ------------------------------------------------------------------------
+    # allocators/memory
+    # ------------------------------------------------------------------------
+    depends_on("hwloc +cuda", when="+cuda ^gcc@11.4:")
+    depends_on("hwloc ~cuda", when="gcc@:11.4")
+    depends_on("numactl")
+
+
+    # ------------------------------------------------------------------------
+    # mpi and parallel io
+    # ------------------------------------------------------------------------
+    depends_on("mpi")
+    depends_on("ucx +thread_multiple", when="+ucx")
+    depends_on("libfabric@1.17:", when="+ofi")
+
+    # ------------------------------------------------------------------------
+    # testing
+    # ------------------------------------------------------------------------
+    depends_on("googletest")
+
+    # ------------------------------------------------------------------------
+    def cmake_args(self):
+        """Populate cmake arguments for Mercury."""
+        spec = self.spec
+        define = self.define
+        define_from_variant = self.define_from_variant
+        parallel_tests = "+mpi" in spec and self.run_tests
+
+        cmake_args = [
+            define_from_variant("OOMPH_WITH_LIBFABRIC", "ofi"),
+            define_from_variant("OOMPH_WITH_UCX", "ucx"),
+        ]
+
+        if "+ofi" in spec:
+            ofi_fabrics = spec["libfabric"].variants["fabrics"].value
+            ofi_fabrics = next((fabric for fabric in ofi_fabrics if fabric in self.fabrics), None)
+            if ofi_fabrics is None:
+                raise ValueError("No matching fabric found in ofi_fabrics and fabrics")
+            cmake_args.append(f"OOMPH_LIBFABRIC_PROVIDER={ofi_fabrics}")
+        print (cmake_args)
+        return cmake_args
+
+    # ------------------------------------------------------------------------
diff --git a/src/libfabric/CMakeLists.txt b/src/libfabric/CMakeLists.txt
index 6de175ad..81d7f907 100644
--- a/src/libfabric/CMakeLists.txt
+++ b/src/libfabric/CMakeLists.txt
@@ -1,10 +1,19 @@
 find_package(Boost REQUIRED COMPONENTS thread)
+
+# dummy library of our private headers
 add_library(oomph_private_libfabric_headers INTERFACE)
 target_include_directories(oomph_private_libfabric_headers INTERFACE
     "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>")
+
+# actual library (created in oomph_libfabric.cmake) source files,
+# depends on dummy library
 target_link_libraries(oomph_libfabric PRIVATE oomph_private_libfabric_headers)
 target_link_libraries(oomph_libfabric PRIVATE Boost::thread)
 
+# we need to include a binary dir for the oomph_config_defines.hpp file
+target_include_directories(oomph_libfabric INTERFACE
+    "$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/src/libfabric>")
+
 list(TRANSFORM oomph_sources PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/../
     OUTPUT_VARIABLE oomph_sources_libfabric)
 target_sources(oomph_libfabric PRIVATE ${oomph_sources_libfabric})
diff --git a/src/libfabric/README.txt b/src/libfabric/README.txt
index 007e014d..70bc8151 100644
--- a/src/libfabric/README.txt
+++ b/src/libfabric/README.txt
@@ -42,3 +42,14 @@ clang-format -i ./include/hpx/parcelport_libfabric/controller_base.hpp
 meld ./include/hpx/parcelport_libfabric/memory_region.hpp ~/src/ghex/extern/oomph/src/libfabric/memory_region.hpp
 meld ./include/hpx/parcelport_libfabric/operation_context_base.hpp ~/src/ghex/extern/oomph/src/libfabric/operation_context_base.hpp
 meld ./include/hpx/parcelport_libfabric/controller_base.hpp ~/src/ghex/extern/oomph/src/libfabric/controller_base.hpp
+
+cp ~/src/ghex/extern/oomph/src/libfabric/memory_region.hpp          ./include/hpx/parcelport_libfabric/memory_region.hpp
+cp ~/src/ghex/extern/oomph/src/libfabric/operation_context_base.hpp ./include/hpx/parcelport_libfabric/operation_context_base.hpp
+cp ~/src/ghex/extern/oomph/src/libfabric/controller_base.hpp        ./include/hpx/parcelport_libfabric/controller_base.hpp
+cp ~/src/ghex/extern/oomph/src/libfabric/fabric_error.hpp           ./include/hpx/parcelport_libfabric/fabric_error.hpp
+
+clang-format -i include/hpx/parcelport_libfabric/memory_region.hpp
+clang-format -i include/hpx/parcelport_libfabric/operation_context_base.hpp
+clang-format -i include/hpx/parcelport_libfabric/controller_base.hpp
+clang-format -i include/hpx/parcelport_libfabric/fabric_error.hpp
+
diff --git a/src/libfabric/communicator.hpp b/src/libfabric/communicator.hpp
index 76509a2a..0b5397ba 100644
--- a/src/libfabric/communicator.hpp
+++ b/src/libfabric/communicator.hpp
@@ -20,7 +20,7 @@
 // paths relative to backend
 #include <../communicator_base.hpp>
 #include <../device_guard.hpp>
-#include <./operation_context.hpp>
+#include <operation_context.hpp>
 #include <request_state.hpp>
 #include <controller.hpp>
 #include <context.hpp>
@@ -28,12 +28,12 @@
 namespace oomph
 {
 
-using operation_context = oomph::libfabric::operation_context;
+using operation_context = libfabric::operation_context;
 
 using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>;
 
 // cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> com_deb("COMMUNI");
+static NS_DEBUG::enable_print<true>  com_deb("COMMUNI");
 static NS_DEBUG::enable_print<true>  com_err("COMMUNI");
 
 class communicator_impl : public communicator_base<communicator_impl>
@@ -63,7 +63,7 @@ class communicator_impl : public communicator_base<communicator_impl>
     , m_recv_cb_queue(128)
     , m_recv_cb_cancel(8)
     {
-        OOMPH_DP_ONLY(com_deb, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm())));
+        LF_DEB(com_deb, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm())));
         m_tx_endpoint = m_context->get_controller()->get_tx_endpoint();
         m_rx_endpoint = m_context->get_controller()->get_rx_endpoint();
     }
@@ -77,10 +77,10 @@ class communicator_impl : public communicator_base<communicator_impl>
     // --------------------------------------------------------------------
     /// generate a tag with 0xRRRRRRRRtttttttt rank, tag.
     /// original tag can be 32bits, then we add 32bits of rank info.
-    inline std::uint64_t make_tag64(std::uint32_t tag, std::uint32_t rank)
+    inline std::uint64_t make_tag64(std::uint32_t tag, /*std::uint32_t rank, */std::uintptr_t ctxt)
     {
-        return (((std::uint64_t(rank) & 0x00000000FFFFFFFF) << 32) |
-                ((std::uint64_t(tag) & 0x00000000FFFFFFFF)));
+        return (((ctxt & 0x0000000000FFFFFF) << 24) |
+                ((std::uint64_t(tag) & 0x0000000000FFFFFF)));
     }
 
     // --------------------------------------------------------------------
@@ -94,7 +94,7 @@ class communicator_impl : public communicator_base<communicator_impl>
             if (ret == 0) { return; }
             else if (ret == -FI_EAGAIN)
             {
-                com_deb.error("Reposting", msg);
+                // com_deb.error("Reposting", msg);
                 // no point stressing the system
                 m_context->get_controller()->poll_for_work_completions(this);
             }
@@ -105,7 +105,7 @@ class communicator_impl : public communicator_base<communicator_impl>
                 com_err.error("No destination endpoint, terminating.");
                 std::terminate();
             }
-            else if (ret) { throw libfabric::fabric_error(int(ret), msg); }
+            else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); }
         }
     }
 
@@ -116,8 +116,8 @@ class communicator_impl : public communicator_base<communicator_impl>
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
         // clang-format off
-        OOMPH_DP_ONLY(com_deb,
-            debug(NS_DEBUG::str<>("send message buffer"),
+        LF_DEB(com_deb,
+            debug(NS_DEBUG::str<>("send_tagged_region"),
                   "->", NS_DEBUG::dec<2>(dst_addr_),
                   send_region,
                   "tag", tag_disp(tag_),
@@ -135,12 +135,9 @@ class communicator_impl : public communicator_base<communicator_impl>
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
         // clang-format on
-        OOMPH_DP_ONLY(com_deb,
-            debug(NS_DEBUG::str<>("inject tagged"),
-                  "->", NS_DEBUG::dec<2>(dst_addr_),
-                  send_region,
-                  "tag", tag_disp(tag_),
-                  "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep())));
+        LF_DEB(com_deb,
+            debug(NS_DEBUG::str<>("inject tagged"), "->", NS_DEBUG::dec<2>(dst_addr_), send_region,
+                "tag", tag_disp(tag_), "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep())));
         // clang-format off
         execute_fi_function(fi_tinject, "fi_tinject", m_tx_endpoint.get_ep(),
             send_region.get_address(), size, dst_addr_, tag_);
@@ -155,8 +152,8 @@ class communicator_impl : public communicator_base<communicator_impl>
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
         // clang-format off
-        OOMPH_DP_ONLY(com_deb,
-            debug(NS_DEBUG::str<>("recv message buffer"),
+        LF_DEB(com_deb,
+            debug(NS_DEBUG::str<>("recv_tagged_region"),
                   "<-", NS_DEBUG::dec<2>(src_addr_),
                   recv_region,
                   "tag", tag_disp(tag_),
@@ -175,15 +172,14 @@ class communicator_impl : public communicator_base<communicator_impl>
         std::size_t* scheduled)
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t         stag = make_tag64(tag, this->rank());
+        std::uint64_t         stag = make_tag64(tag, /*this->rank(), */this->m_context->get_context_tag());
 
         auto& reg = ptr.handle_ref();
 #ifdef EXTRA_SIZE_CHECKS
         if (size != reg.get_size())
         {
-            OOMPH_DP_ONLY(com_err,
-                error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size",
-                    NS_DEBUG::hex<6>(reg.get_size())));
+            LF_DEB(com_err, error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size),
+                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
         }
 #endif
         m_context->get_controller()->sends_posted_++;
@@ -215,7 +211,7 @@ class communicator_impl : public communicator_base<communicator_impl>
         s->create_self_ref();
 
         // clang-format off
-        OOMPH_DP_ONLY(com_deb,
+        LF_DEB(com_deb,
             debug(NS_DEBUG::str<>("Send"),
                   "thisrank", NS_DEBUG::dec<>(rank()),
                   "rank", NS_DEBUG::dec<>(dst),
@@ -238,15 +234,14 @@ class communicator_impl : public communicator_base<communicator_impl>
         std::size_t* scheduled)
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t         stag = make_tag64(tag, src);
+        std::uint64_t         stag = make_tag64(tag, /*src, */this->m_context->get_context_tag());
 
         auto& reg = ptr.handle_ref();
 #ifdef EXTRA_SIZE_CHECKS
         if (size != reg.get_size())
         {
-            OOMPH_DP_ONLY(com_err,
-                error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size",
-                    NS_DEBUG::hex<6>(reg.get_size())));
+            LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size),
+                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
         }
 #endif
         m_context->get_controller()->recvs_posted_++;
@@ -257,8 +252,8 @@ class communicator_impl : public communicator_base<communicator_impl>
         s->create_self_ref();
 
         // clang-format off
-        OOMPH_DP_ONLY(com_deb,
-            debug(NS_DEBUG::str<>("Recv"),
+        LF_DEB(com_deb,
+            debug(NS_DEBUG::str<>("recv"),
                   "thisrank", NS_DEBUG::dec<>(rank()),
                   "rank", NS_DEBUG::dec<>(src),
                   "tag", tag_disp(std::uint64_t(tag)),
@@ -281,15 +276,14 @@ class communicator_impl : public communicator_base<communicator_impl>
         std::atomic<std::size_t>*                                 scheduled)
     {
         [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__);
-        std::uint64_t         stag = make_tag64(tag, src);
+        std::uint64_t         stag = make_tag64(tag, /*src, */this->m_context->get_context_tag());
 
         auto& reg = ptr.handle_ref();
 #ifdef EXTRA_SIZE_CHECKS
         if (size != reg.get_size())
         {
-            OOMPH_DP_ONLY(com_err,
-                error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size",
-                    NS_DEBUG::hex<6>(reg.get_size())));
+            LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size),
+                                "reg size", NS_DEBUG::hex<6>(reg.get_size())));
         }
 #endif
         m_context->get_controller()->recvs_posted_++;
@@ -300,8 +294,8 @@ class communicator_impl : public communicator_base<communicator_impl>
         s->create_self_ref();
 
         // clang-format off
-        OOMPH_DP_ONLY(com_deb,
-            debug(NS_DEBUG::str<>("Recv"),
+        LF_DEB(com_deb,
+            debug(NS_DEBUG::str<>("shared_recv"),
                   "thisrank", NS_DEBUG::dec<>(rank()),
                   "rank", NS_DEBUG::dec<>(src),
                   "tag", tag_disp(std::uint64_t(tag)),
@@ -368,7 +362,7 @@ class communicator_impl : public communicator_base<communicator_impl>
 
         // submit the cancellation request
         bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0);
-        OOMPH_DP_ONLY(com_deb,
+        LF_DEB(com_deb,
             debug(NS_DEBUG::str<>("Cancel"), "ok", ok, "op_ctx", NS_DEBUG::ptr(op_ctx)));
 
         // if the cancel operation failed completely, return
@@ -387,8 +381,8 @@ class communicator_impl : public communicator_base<communicator_impl>
                 {
                     // our recv was cancelled correctly
                     found = true;
-                    OOMPH_DP_ONLY(com_deb, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx",
-                                               NS_DEBUG::ptr(op_ctx)));
+                    LF_DEB(com_deb, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx",
+                                        NS_DEBUG::ptr(op_ctx)));
                     auto ptr = s->release_self_ref();
                     s->set_canceled();
                 }
diff --git a/src/libfabric/context.cpp b/src/libfabric/context.cpp
index 1baca07b..cbbc4b26 100644
--- a/src/libfabric/context.cpp
+++ b/src/libfabric/context.cpp
@@ -7,6 +7,8 @@
  * Please, refer to the LICENSE file in the root directory.
  * SPDX-License-Identifier: BSD-3-Clause
  */
+#include <cstdint>
+//
 #include <boost/thread.hpp>
 // paths relative to backend
 #include <oomph_libfabric_defines.hpp>
@@ -17,9 +19,9 @@
 namespace oomph
 {
 // cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> src_deb("__SRC__");
+static NS_DEBUG::enable_print<true> src_deb("__SRC__");
 
-using controller_type = oomph::libfabric::controller;
+using controller_type = libfabric::controller;
 
 context_impl::context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_never_free,
     std::size_t message_pool_reserve)
@@ -31,6 +33,12 @@ context_impl::context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_ne
     int rank, size;
     OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank));
     OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size));
+
+    m_ctxt_tag = reinterpret_cast<std::uintptr_t>(this);
+    OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm));
+    LF_DEB(src_deb, debug(NS_DEBUG::str<>("Broadcast"), "rank", debug::dec<3>(rank), "context",
+                        debug::ptr(m_ctxt_tag)));
+
     // TODO fix the thread safety
     // problem: controller is a singleton and has problems when 2 contexts are created in the
     // following order: single threaded first, then multi-threaded after
@@ -75,8 +83,8 @@ context_impl::init_libfabric_controller(oomph::context_impl* /*ctx*/, MPI_Comm c
     static std::shared_ptr<controller_type> instance(nullptr);
     if (!instance.get())
     {
-        OOMPH_DP_ONLY(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank),
-                                   "size", debug::dec<3>(size), "threads", debug::dec<3>(threads)));
+        LF_DEB(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank),
+                            "size", debug::dec<3>(size), "threads", debug::dec<3>(threads)));
         instance.reset(new controller_type());
         instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm);
     }
diff --git a/src/libfabric/context.hpp b/src/libfabric/context.hpp
index f3b34259..d17d8a05 100644
--- a/src/libfabric/context.hpp
+++ b/src/libfabric/context.hpp
@@ -26,14 +26,16 @@
 namespace oomph
 {
 
-using controller_type = oomph::libfabric::controller;
+static NS_DEBUG::enable_print<true> ctx_deb("CONTEXT");
+
+using controller_type = libfabric::controller;
 
 class context_impl : public context_base
 {
   public:
-    using region_type = oomph::libfabric::memory_segment;
+    using region_type = libfabric::memory_segment;
     using domain_type = region_type::provider_domain;
-    using device_region_type = oomph::libfabric::memory_segment;
+    using device_region_type = libfabric::memory_segment;
     using heap_type = hwmalloc::heap<context_impl>;
     using callback_queue = boost::lockfree::queue<detail::shared_request_state*,
         boost::lockfree::fixed_sized<false>, boost::lockfree::allocator<std::allocator<void>>>;
@@ -42,6 +44,7 @@ class context_impl : public context_base
     heap_type                        m_heap;
     domain_type*                     m_domain;
     std::shared_ptr<controller_type> m_controller;
+    std::uintptr_t                   m_ctxt_tag;
 
   public:
     // --------------------------------------------------
@@ -61,22 +64,23 @@ class context_impl : public context_base
     context_impl(context_impl const&) = delete;
     context_impl(context_impl&&) = delete;
 
-    region_type make_region(void* const ptr, std::size_t size, bool /*device*/)
+    region_type make_region(void* const ptr, std::size_t size, int device_id)
     {
-        bool bind_mr = ((m_controller->memory_registration_mode_flags() & FI_MR_ENDPOINT) != 0);
-        if (bind_mr) {
-            void *endpoint = m_controller->get_rx_endpoint().get_ep();
-            return oomph::libfabric::memory_segment(m_domain, ptr, size, bind_mr, endpoint);
-        }
-        else {
-            return oomph::libfabric::memory_segment(m_domain, ptr, size, false, nullptr);
+        if (m_controller->get_mrbind())
+        {
+            void* endpoint = m_controller->get_rx_endpoint().get_ep();
+            return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id);
         }
+        else { return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id); }
     }
 
     auto& get_heap() noexcept { return m_heap; }
 
     communicator_impl* get_communicator();
 
+    // we must modify all tags to use 32bits of context ptr for uniqueness
+    inline std::uintptr_t get_context_tag() { return m_ctxt_tag; }
+
     inline controller_type* get_controller() /*const */ { return m_controller.get(); }
     const char*             get_transport_option(const std::string& opt);
 
@@ -107,9 +111,8 @@ class context_impl : public context_base
                 {
                     // our recv was cancelled correctly
                     found = true;
-                    OOMPH_DP_ONLY(libfabric::ctx_deb,
-                        debug(NS_DEBUG::str<>("Cancel shared"), "succeeded", "op_ctx",
-                            NS_DEBUG::ptr(op_ctx)));
+                    LF_DEB(oomph::ctx_deb, debug(NS_DEBUG::str<>("Cancel shared"), "succeeded",
+                                               "op_ctx", NS_DEBUG::ptr(op_ctx)));
                     auto ptr = s->release_self_ref();
                     s->set_canceled();
                 }
@@ -138,15 +141,15 @@ template<>
 inline oomph::libfabric::memory_segment
 register_memory<oomph::context_impl>(oomph::context_impl& c, void* const ptr, std::size_t size)
 {
-    return c.make_region(ptr, size, false);
+    return c.make_region(ptr, size, -2);
 }
 
 #if OOMPH_ENABLE_DEVICE
 template<>
 inline oomph::libfabric::memory_segment
-register_device_memory<context_impl>(context_impl& c, void* ptr, std::size_t size)
+register_device_memory<context_impl>(context_impl& c, int device_id, void* ptr, std::size_t size)
 {
-    return c.make_region(ptr, size, true);
+    return c.make_region(ptr, size, device_id);
 }
 #endif
 
diff --git a/src/libfabric/controller.hpp b/src/libfabric/controller.hpp
index 301c0eb5..04443a36 100644
--- a/src/libfabric/controller.hpp
+++ b/src/libfabric/controller.hpp
@@ -35,24 +35,21 @@
 #include <rdma/fi_rma.h>
 #include <rdma/fi_tagged.h>
 //
+#include "oomph_libfabric_defines.hpp"
 #include "fabric_error.hpp"
 #include "locality.hpp"
 #include "memory_region.hpp"
 #include "operation_context.hpp"
-#include "simple_counter.hpp"
-#include "print.hpp"
 #include "controller_base.hpp"
 //
 #include <oomph/util/unique_function.hpp>
 //
-#include "libfabric_defines.hpp"
-//
 #include <mpi.h>
 
 namespace NS_DEBUG
 {
 // cppcheck-suppress ConfigurationNotChecked
-static debug::enable_print<false> cnt_deb("CONTROL");
+static debug::enable_print<true>  cnt_deb("CONTROL");
 static debug::enable_print<true>  cnt_err("CONTROL");
 } // namespace NS_DEBUG
 
@@ -79,7 +76,7 @@ class controller : public controller_base<controller>
     // --------------------------------------------------------------------
     constexpr fi_threading threadlevel_flags()
     {
-#if defined(HAVE_LIBFABRIC_GNI) || defined(HAVE_LIBFABRIC_CXI)
+#if defined(HAVE_LIBFABRIC_GNI) /*|| defined(HAVE_LIBFABRIC_CXI)*/
         return FI_THREAD_ENDPOINT;
 #else
         return FI_THREAD_SAFE;
@@ -87,7 +84,15 @@ class controller : public controller_base<controller>
     }
 
     // --------------------------------------------------------------------
-    constexpr uint64_t caps_flags() { return FI_MSG | FI_TAGGED; }
+    constexpr uint64_t caps_flags()
+    {
+#if OOMPH_ENABLE_DEVICE
+        std::int64_t hmem_flags = FI_HMEM;
+#else
+        std::int64_t hmem_flags = 0;
+#endif
+        return hmem_flags | FI_MSG | FI_TAGGED | FI_RMA | FI_READ | FI_WRITE | FI_RECV  | FI_SEND | FI_TRANSMIT | FI_REMOTE_READ | FI_REMOTE_WRITE ;
+    }
 
     // --------------------------------------------------------------------
     // we do not need to perform any special actions on init (to contact root node)
@@ -102,14 +107,14 @@ class controller : public controller_base<controller>
         //
         if (rank > 0)
         {
-            DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending here"), iplocality(here_), "size",
-                                         locality_defs::array_size));
+            LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending here"), iplocality(here_), "size",
+                                          locality_defs::array_size));
             /*int err = */ MPI_Send(here_.fabric_data(), locality_defs::array_size, MPI_CHAR,
                 0, // dst rank
                 0, // tag
                 comm);
 
-            DEBUG(NS_DEBUG::cnt_deb,
+            LF_DEB(NS_DEBUG::cnt_deb,
                 debug(debug::str<>("receiving all"), "size", locality_defs::array_size));
 
             MPI_Status status;
@@ -117,28 +122,29 @@ class controller : public controller_base<controller>
                 0, // src rank
                 0, // tag
                 comm, &status);
-            DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("received addresses")));
+            LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("received addresses")));
         }
         else
         {
-            DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving addresses")));
+            LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving addresses")));
             memcpy(&localities[0], here_.fabric_data(), locality_defs::array_size);
             for (int i = 1; i < size; ++i)
             {
-                DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving address"), debug::dec<>(i)));
+                LF_DEB(NS_DEBUG::cnt_deb,
+                    debug(debug::str<>("receiving address"), debug::dec<>(i)));
                 MPI_Status status;
                 /*int err = */ MPI_Recv(&localities[i * locality_defs::array_size],
                     size * locality_defs::array_size, MPI_CHAR,
                     i, // src rank
                     0, // tag
                     comm, &status);
-                DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("received address"), debug::dec<>(i)));
+                LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("received address"), debug::dec<>(i)));
             }
 
-            DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending all")));
+            LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending all")));
             for (int i = 1; i < size; ++i)
             {
-                DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending to"), debug::dec<>(i)));
+                LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending to"), debug::dec<>(i)));
                 /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size, MPI_CHAR,
                     i, // dst rank
                     0, // tag
@@ -147,7 +153,7 @@ class controller : public controller_base<controller>
         }
 
         // all ranks should now have a full localities vector
-        DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("populating vector")));
+        LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("populating vector")));
         for (int i = 0; i < size; ++i)
         {
             locality temp;
@@ -168,11 +174,11 @@ class controller : public controller_base<controller>
         MPI_Comm_rank(mpi_comm, &rank);
         MPI_Comm_size(mpi_comm, &size);
 
-        DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("initialize_localities"), size, "localities"));
+        LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("initialize_localities"), size, "localities"));
 
         MPI_exchange_localities(av, mpi_comm, rank, size);
         debug_print_av_vector(size);
-        DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("Done localities")));
+        LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Done localities")));
     }
 
     // --------------------------------------------------------------------
@@ -182,8 +188,7 @@ class controller : public controller_base<controller>
         return true;
 #elif defined(HAVE_LIBFABRIC_CXI)
         // @todo : cxi provider is not yet thread safe using scalable endpoints
-        return (threadlevel_flags() == FI_THREAD_SAFE ||
-                endpoint_type_ == endpoint_type::threadlocalTx);
+        return false;
 #else
         return (threadlevel_flags() == FI_THREAD_SAFE ||
                 endpoint_type_ == endpoint_type::threadlocalTx);
@@ -240,7 +245,8 @@ class controller : public controller_base<controller>
         send_poll_stamp = now;
 #endif
         int             ret;
-        fi_cq_msg_entry entry[256]; // max_completions_per_poll_ must be <= this
+        fi_cq_msg_entry entry[max_completions_array_limit_];
+        assert(max_completions_per_poll_ <= max_completions_array_limit_);
         {
             auto lock = try_tx_lock();
 
@@ -249,7 +255,7 @@ class controller : public controller_base<controller>
             if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; }
 
             static auto polling = NS_DEBUG::cnt_deb.make_timer(1, debug::str<>("poll send queue"));
-            DEBUG(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(send_cq)));
+            LF_DEB(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(send_cq)));
 
             // poll for completions
             {
@@ -293,14 +299,14 @@ class controller : public controller_base<controller>
             for (int i = 0; i < ret; ++i)
             {
                 ++sends_complete;
-                DEBUG(NS_DEBUG::cnt_deb,
+                LF_DEB(NS_DEBUG::cnt_deb,
                     debug(debug::str<>("Completion"), i, debug::dec<2>(i), "txcq flags",
                         fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(",
                         debug::dec<>(entry[i].flags), ")", "context",
                         NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len)));
                 if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0)
                 {
-                    DEBUG(NS_DEBUG::cnt_deb,
+                    LF_DEB(NS_DEBUG::cnt_deb,
                         debug(debug::str<>("Completion"), "txcq tagged send completion",
                             NS_DEBUG::ptr(entry[i].op_context)));
 
@@ -336,8 +342,8 @@ class controller : public controller_base<controller>
         recv_poll_stamp = now;
 #endif
         int             ret;
-        fi_cq_msg_entry entry[256]; // max_completions_per_poll_ must be <= this
-
+        fi_cq_msg_entry entry[max_completions_array_limit_];
+        assert(max_completions_per_poll_ <= max_completions_array_limit_);
         {
             auto lock = get_rx_lock();
 
@@ -346,7 +352,7 @@ class controller : public controller_base<controller>
             if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; }
 
             static auto polling = NS_DEBUG::cnt_deb.make_timer(1, debug::str<>("poll recv queue"));
-            DEBUG(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(rx_cq)));
+            LF_DEB(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(rx_cq)));
 
             // poll for completions
             {
@@ -362,7 +368,7 @@ class controller : public controller_base<controller>
                 // from the manpage 'man 3 fi_cq_readerr'
                 if (e.err == FI_ECANCELED)
                 {
-                    DEBUG(NS_DEBUG::cnt_deb,
+                    LF_DEB(NS_DEBUG::cnt_deb,
                         debug(debug::str<>("rxcq Cancelled"), "flags", debug::hex<6>(e.flags),
                             "len", debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context)));
                     // the request was cancelled, we can simply exit
@@ -373,9 +379,9 @@ class controller : public controller_base<controller>
                 }
                 else if (e.err != FI_SUCCESS)
                 {
-                    NS_DEBUG::cnt_err.error("rxcq Error ??? ", "err", debug::dec<>(-e.err), "flags",
+                    NS_DEBUG::cnt_err.error(debug::str<>("poll_recv_queue"), "error code", debug::dec<>(-e.err), "flags",
                         debug::hex<6>(e.flags), "len", debug::hex<6>(e.len), "context",
-                        NS_DEBUG::ptr(e.op_context), "error",
+                        NS_DEBUG::ptr(e.op_context), "error msg",
                         fi_cq_strerror(rx_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len));
                 }
                 operation_context* handler = reinterpret_cast<operation_context*>(e.op_context);
@@ -392,14 +398,14 @@ class controller : public controller_base<controller>
             for (int i = 0; i < ret; ++i)
             {
                 ++recvs_complete;
-                DEBUG(NS_DEBUG::cnt_deb,
+                LF_DEB(NS_DEBUG::cnt_deb,
                     debug(debug::str<>("Completion"), i, "rxcq flags",
                         fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(",
                         debug::dec<>(entry[i].flags), ")", "context",
                         NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len)));
                 if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0)
                 {
-                    DEBUG(NS_DEBUG::cnt_deb,
+                    LF_DEB(NS_DEBUG::cnt_deb,
                         debug(debug::str<>("Completion"), "rxcq tagged recv completion",
                             NS_DEBUG::ptr(entry[i].op_context)));
 
@@ -430,14 +436,16 @@ class controller : public controller_base<controller>
         (void)info; // unused variable warning
         (void)tx;   // unused variable warning
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
         struct fi_info* hints = fi_dupinfo(info);
         if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
         // clear any Rx address data that might be set
-        free(hints->src_addr);
+        // free(hints->src_addr);
+        // hints->src_addr = nullptr;
+        // hints->src_addrlen = 0;
         free(hints->dest_addr);
-        hints->src_addr = nullptr;
         hints->dest_addr = nullptr;
+        hints->dest_addrlen = 0;
         return hints;
     }
 };
diff --git a/src/libfabric/controller_base.hpp b/src/libfabric/controller_base.hpp
index 1a4b768d..41191a57 100644
--- a/src/libfabric/controller_base.hpp
+++ b/src/libfabric/controller_base.hpp
@@ -38,33 +38,15 @@
 #include <rdma/fi_rma.h>
 #include <rdma/fi_tagged.h>
 //
-#include "libfabric_defines.hpp"
+#include "oomph_libfabric_defines.hpp"
 //
 #include "fabric_error.hpp"
 #include "locality.hpp"
 #include "memory_region.hpp"
 #include "operation_context_base.hpp"
-#include "simple_counter.hpp"
-
-// ------------------------------------------------------------------
-// This section exists to make interoperabily/sharing of code
-// between OOMPH/GHEX and HPX easier
-#if __has_include("print.hpp")
-#include "print.hpp"
-#define NS_LIBFABRIC oomph::libfabric
-#define DEBUG        OOMPH_DP_ONLY
-#elif __has_include(<hpx/debugging/print.hpp>)
-#include <hpx/debugging/print.hpp>
-#define NS_LIBFABRIC hpx::parcelset::policies::libfabric
-using namespace NS_LIBFABRIC;
-#endif
-
-#if __has_include("simple_counter.hpp")
-#include "simple_counter.hpp"
-#endif
 
 //#define DISABLE_FI_INJECT
-// #define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50
+//#define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50
 
 // ------------------------------------------------------------------
 
@@ -150,7 +132,8 @@ static int
 libfabric_completions_per_poll()
 {
     auto env_str = std::getenv("LIBFABRIC_POLL_SIZE");
-    if (env_str != nullptr) {
+    if (env_str != nullptr)
+    {
         try
         {
             return std::atoi(env_str);
@@ -169,7 +152,8 @@ static int
 libfabric_rendezvous_threshold(int def_val)
 {
     auto env_str = std::getenv("LIBFABRIC_RENDEZVOUS_THRESHOLD");
-    if (env_str != nullptr) {
+    if (env_str != nullptr)
+    {
         try
         {
             char* end;
@@ -191,12 +175,12 @@ libfabric_rendezvous_threshold(int def_val)
 #define OOMPH_GNI_REG "internal"
 //#define OOMPH_GNI_REG "udreg"
 
-std::vector<std::pair<int, std::string>> gni_strs = {
+static std::vector<std::pair<int, std::string>> gni_strs = {
     {GNI_MR_CACHE, "GNI_MR_CACHE"},
 };
 
 // clang-format off
-std::vector<std::pair<int, std::string>> gni_ints = {
+static std::vector<std::pair<int, std::string>> gni_ints = {
     {GNI_MR_CACHE_LAZY_DEREG, "GNI_MR_CACHE_LAZY_DEREG"},
     {GNI_MR_HARD_REG_LIMIT, "GNI_MR_HARD_REG_LIMIT"},
     {GNI_MR_SOFT_REG_LIMIT, "GNI_MR_SOFT_REG_LIMIT"},
@@ -225,13 +209,18 @@ std::vector<std::pair<int, std::string>> gni_ints = {
 // clang-format on
 #endif
 
-#define LIBFABRIC_FI_VERSION_MAJOR 1
-#define LIBFABRIC_FI_VERSION_MINOR 11
+#if defined(HAVE_LIBFABRIC_CXI)
+# define LIBFABRIC_FI_VERSION_MAJOR 1
+# define LIBFABRIC_FI_VERSION_MINOR 15
+#else
+# define LIBFABRIC_FI_VERSION_MAJOR 1
+# define LIBFABRIC_FI_VERSION_MINOR 15
+#endif
 
 namespace NS_DEBUG
 {
 // cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> cnb_deb("CONBASE");
+static NS_DEBUG::enable_print<true> cnb_deb("CONBASE");
 static NS_DEBUG::enable_print<true> cnb_err("CONBASE");
 } // namespace NS_DEBUG
 
@@ -255,7 +244,6 @@ struct progress_status
 
 namespace NS_LIBFABRIC
 {
-
 /// A wrapper around fi_close that reports any error
 /// Because we use so many handles, we must be careful to
 /// delete them all before closing resources that use them
@@ -263,7 +251,7 @@ template<typename Handle>
 void
 fidclose(Handle fid, const char* msg)
 {
-    DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg));
+    LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg));
     int ret = fi_close(fid);
     if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); }
     else if (ret == FI_SUCCESS) { return; }
@@ -356,7 +344,7 @@ struct stack_endpoint
     ~stack_endpoint()
     {
         if (!pool_) return;
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             trace(debug::str<>("Scalable Ep"), "used push", "ep", NS_DEBUG::ptr(get_ep()), "tx cq",
                 NS_DEBUG::ptr(get_tx_cq()), "rx cq", NS_DEBUG::ptr(get_rx_cq())));
         pool_->push(endpoint_);
@@ -422,10 +410,20 @@ class controller_base
 
     uint32_t max_completions_per_poll_;
     uint32_t msg_rendezvous_threshold_;
+    inline static constexpr uint32_t max_completions_array_limit_ = 256;
 
     static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp;
     static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp;
 
+    // set if FI_MR_LOCAL is required (local access requires binding)
+    bool mrlocal = false;
+    // set if FI_MR_ENDPOINT is required (per endpoint memory binding)
+    bool mrbind = false;
+    // set if FI_MR_HRMEM provider requires heterogeneous memory registration
+    bool mrhmem = false;
+  public:
+    bool get_mrbind() { return mrbind;}
+
   public:
     NS_LIBFABRIC::simple_counter<int, false> sends_posted_;
     NS_LIBFABRIC::simple_counter<int, false> recvs_posted_;
@@ -436,7 +434,7 @@ class controller_base
 
     void finvoke(const char* msg, const char* err, int ret)
     {
-        DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>(msg)));
+        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>(msg)));
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, err);
     }
 
@@ -474,7 +472,7 @@ class controller_base
         unsigned int          rma_reads_ = 0;
         unsigned int          recv_deletes_ = 0;
 
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             debug(debug::str<>("counters"), "Received messages", debug::dec<>(messages_handled_),
                 "Total reads", debug::dec<>(rma_reads_), "Total deletes",
                 debug::dec<>(recv_deletes_), "deletes error",
@@ -511,7 +509,7 @@ class controller_base
         fidclose(&fabric_->fid, "Fabric");
 
         // clean up
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info")));
 
         fi_freeinfo(fabric_info_);
     }
@@ -519,7 +517,8 @@ class controller_base
     // --------------------------------------------------------------------
     // setup an endpoint for receiving messages,
     // usually an rx endpoint is shared by all threads
-    endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info, struct fid_av* av)
+    endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info,
+        struct fid_av* av)
     {
         auto ep_rx = new_endpoint_active(domain, info, false);
 
@@ -541,24 +540,24 @@ class controller_base
     void initialize(std::string const& provider, bool rootnode, int size, size_t threads,
         Args&&... args)
     {
-        DEBUG(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); }));
+        LF_DEB(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); }));
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
 
         max_completions_per_poll_ = libfabric_completions_per_poll();
-        DEBUG(NS_DEBUG::cnb_err,
+        LF_DEB(NS_DEBUG::cnb_err,
             debug(debug::str<>("Poll completions"), debug::dec<3>(max_completions_per_poll_)));
 
         uint32_t default_val = (threads == 1) ? 0x400 : 0x4000;
         msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val);
-        DEBUG(NS_DEBUG::cnb_err,
+        LF_DEB(NS_DEBUG::cnb_err,
             debug(debug::str<>("Rendezvous threshold"), debug::hex<4>(msg_rendezvous_threshold_)));
 
         endpoint_type_ = static_cast<endpoint_type>(libfabric_endpoint_type());
-        DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string()));
+        LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string()));
 
         eps_ = std::make_unique<endpoints_lifetime_manager>();
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads)));
 
         open_fabric(provider, threads, rootnode);
 
@@ -566,7 +565,8 @@ class controller_base
         av_ = create_address_vector(fabric_info_, size, threads);
 
         // we need an rx endpoint in all cases except scalable rx
-        if (endpoint_type_ != endpoint_type::scalableTxRx) {
+        if (endpoint_type_ != endpoint_type::scalableTxRx)
+        {
             // setup an endpoint for receiving messages
             // rx endpoint is typically shared by all threads
             eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_);
@@ -578,8 +578,10 @@ class controller_base
             auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep());
             eps_->ep_rx_.set_tx_cq(tx_cq);
         }
-        else if (endpoint_type_ != endpoint_type::scalableTxRx) {
-#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI)
+        else if (endpoint_type_ != endpoint_type::scalableTxRx)
+        {
+#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) ||                              \
+    defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || defined(HAVE_LIBFABRIC_EFA)
             // it appears that the rx endpoint cannot be enabled if it does not
             // have a Tx CQ (at least when using sockets), so we create a dummy
             // Tx CQ and bind it just to stop libfabric from triggering an error.
@@ -599,11 +601,11 @@ class controller_base
             auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true);
 
             // create a completion queue for tx endpoint
-            fabric_info_->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION;
-            auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size,
-                "tx multiple");
+            fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
+            auto tx_cq =
+                create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, "tx multiple");
 
-            bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "rx multiple");
+            bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple");
             bind_address_vector_to_endpoint(ep_tx, av_);
             enable_endpoint(ep_tx, "tx multiple");
 
@@ -619,10 +621,11 @@ class controller_base
         {
             // setup tx contexts for each possible thread
             size_t threads_allocated = 0;
-            auto   ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, threads_allocated);
+            auto   ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads,
+                  threads_allocated);
 
-            DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
-                                         "Contexts allocated", debug::dec<4>(threads_allocated)));
+            LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
+                                          "Contexts allocated", debug::dec<4>(threads_allocated)));
 
             finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind",
                 fi_scalable_ep_bind(ep_sx, &av_->fid, 0));
@@ -648,7 +651,7 @@ class controller_base
                 enable_endpoint(scalable_ep_tx, "tx scalable");
 
                 endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
-                DEBUG(NS_DEBUG::cnb_deb,
+                LF_DEB(NS_DEBUG::cnb_deb,
                     trace(debug::str<>("Scalable Ep"), "initial tx push", "ep",
                         NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq",
                         NS_DEBUG::ptr(tx.get_rx_cq())));
@@ -661,7 +664,7 @@ class controller_base
         // once enabled we can get the address
         enable_endpoint(eps_->ep_rx_.get_ep(), "rx here");
         here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid);
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_)));
 
         //        // if we are using scalable endpoints, then setup tx/rx contexts
         //        // we will us a single endpoint for all Tx/Rx contexts
@@ -676,7 +679,7 @@ class controller_base
         //            if (!ep_sx)
         //                throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable endpoint creation failed");
 
-        //            DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
+        //            LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"),
         //                                         "Contexts allocated", debug::dec<4>(threads_allocated)));
 
         //            // prepare the stack for insertions
@@ -706,7 +709,7 @@ class controller_base
         //                enable_endpoint(scalable_ep_tx, "tx scalable");
 
         //                endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable");
-        //                DEBUG(NS_DEBUG::cnb_deb,
+        //                LF_DEB(NS_DEBUG::cnb_deb,
         //                    trace(debug::str<>("Scalable Ep"), "initial tx push", "ep",
         //                        NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq",
         //                        NS_DEBUG::ptr(tx.get_rx_cq())));
@@ -724,7 +727,7 @@ class controller_base
         ////                enable_endpoint(scalable_ep_rx, "rx scalable");
 
         ////                endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable");
-        ////                DEBUG(NS_DEBUG::cnb_deb,
+        ////                LF_DEB(NS_DEBUG::cnb_deb,
         ////                    trace(debug::str<>("Scalable Ep"), "initial rx push", "ep",
         ////                        NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq",
         ////                        NS_DEBUG::ptr(rx.get_rx_cq())));
@@ -750,24 +753,26 @@ class controller_base
     }
 
     // --------------------------------------------------------------------
-    constexpr int memory_registration_mode_flags()
+    constexpr std::int64_t memory_registration_mode_flags()
     {
-        // use basic registration for providers except CXI
+        std::int64_t base_flags = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY;
+#if OOMPH_ENABLE_DEVICE
+        base_flags = base_flags | FI_MR_HMEM;
+#endif
+        base_flags = base_flags | FI_MR_LOCAL;
+
 #if defined(HAVE_LIBFABRIC_CXI)
-        int base_flags =
-            FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | FI_MR_MMU_NOTIFY;
-        return base_flags | FI_MR_ENDPOINT | FI_MR_HMEM;
-#elif defined(HAVE_LIBFABRIC_GNI)
-        return FI_MR_BASIC; // FI_MR_SCALABLE one day?;
+        return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT;
+
+#elif defined(HAVE_LIBFABRIC_EFA)
+        return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT;
 #else
-        return FI_MR_BASIC;
+        return base_flags;
 #endif
     }
 
     // --------------------------------------------------------------------
-    uint32_t rendezvous_threshold() {
-        return msg_rendezvous_threshold_;
-    }
+    uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; }
     // --------------------------------------------------------------------
     // initialize the basic fabric/domain/name
     void open_fabric(std::string const& provider, int threads, bool rootnode)
@@ -780,10 +785,12 @@ class controller_base
             throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints");
         }
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_)));
 
 #if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS)
         fabric_hints_->addr_format = FI_SOCKADDR_IN;
+#elif defined(HAVE_LIBFABRIC_EFA)
+        fabric_hints_->addr_format = FI_ADDR_EFA;
 #endif
 
         fabric_hints_->caps = caps_flags();
@@ -800,7 +807,7 @@ class controller_base
                 strdup(std::string(provider + ";ofi_rxm").c_str());
         }
         else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); }
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             debug(debug::str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name));
 
         fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags();
@@ -809,11 +816,11 @@ class controller_base
         auto progress = libfabric_progress_type();
         fabric_hints_->domain_attr->control_progress = progress;
         fabric_hints_->domain_attr->data_progress = progress;
-        DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string()));
+        LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string()));
 
         if (threads > 1)
         {
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID")));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID")));
             // Enable thread safe mode (Does not work with psm2 provider)
             // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE;
             //fabric_hints_->domain_attr->threading = FI_THREAD_FID;
@@ -821,7 +828,7 @@ class controller_base
         }
         else
         {
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN")));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN")));
             // we serialize everything
             fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN;
         }
@@ -829,11 +836,11 @@ class controller_base
         // Enable resource management
         fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED;
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM"));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM"));
         fabric_hints_->ep_attr->type = FI_EP_RDM;
 
         uint64_t flags = 0;
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             debug(debug::str<>("get fabric info"), "FI_VERSION",
                 debug::dec(LIBFABRIC_FI_VERSION_MAJOR), debug::dec(LIBFABRIC_FI_VERSION_MINOR)));
 
@@ -843,29 +850,32 @@ class controller_base
 
         if (rootnode)
         {
-            DEBUG(NS_DEBUG::cnb_err,
+            LF_DEB(NS_DEBUG::cnb_err,
                 trace(debug::str<>("Fabric info"), "\n", fi_tostr(fabric_info_, FI_TYPE_INFO)));
         }
 
         bool context = (fabric_hints_->mode & FI_CONTEXT) != 0;
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context));
 
-        bool mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0;
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal));
+        mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0;
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal));
 
-        bool mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0;
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), mrbind));
+        mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0;
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), mrbind));
 
         /* Check if provider requires heterogeneous memory registration */
-        bool mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0;
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem));
+        mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0;
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem));
+
+        bool mrhalloc = (fabric_hints_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0;
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ALLOCATED"), mrhalloc));
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric")));
         ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric");
 
         // Allocate a domain.
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain")));
         ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain");
 
@@ -874,18 +884,20 @@ class controller_base
             [[maybe_unused]] auto scp =
                 NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "GNI memory registration block");
 
-            DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values"));
+            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values"));
             // Dump out all vars for debug purposes
-            for (auto &gni_data : gni_strs) {
-                _set_check_domain_op_value<const char*>(gni_data.first, 0,
-                    gni_data.second.c_str(), false);
+            for (auto& gni_data : gni_strs)
+            {
+                _set_check_domain_op_value<const char*>(gni_data.first, 0, gni_data.second.c_str(),
+                    false);
             }
-            DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values"));
-            for (auto &gni_data : gni_ints) {
-                _set_check_domain_op_value<uint32_t>(gni_data.first, 0,
-                    gni_data.second.c_str(), false);
+            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values"));
+            for (auto& gni_data : gni_ints)
+            {
+                _set_check_domain_op_value<uint32_t>(gni_data.first, 0, gni_data.second.c_str(),
+                    false);
             }
-            DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------")));
+            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------")));
 
             // --------------------------
             // GNI_MR_CACHE
@@ -908,7 +920,7 @@ class controller_base
             // Enable lazy deregistration in MR cache
             //
             int32_t enable = 1;
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG")));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG")));
             _set_check_domain_op_value<int32_t>(GNI_MR_CACHE_LAZY_DEREG, enable,
                 "GNI_MR_CACHE_LAZY_DEREG");
 
@@ -947,11 +959,13 @@ class controller_base
         static struct fi_gni_ops_domain* gni_domain_ops = nullptr;
         int                              ret = 0;
 
-        if (gni_domain_ops == nullptr) {
+        if (gni_domain_ops == nullptr)
+        {
             ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0, (void**)&gni_domain_ops,
                 nullptr);
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"),
-                                         NS_DEBUG::ptr(gni_domain_ops)));
+            LF_DEB(NS_DEBUG::cnb_deb,
+                debug(debug::str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"),
+                    NS_DEBUG::ptr(gni_domain_ops)));
         }
 
         // if open was ok and set flag is present, then set value
@@ -960,24 +974,25 @@ class controller_base
             ret = gni_domain_ops->set_val(&fabric_domain_->fid, (dom_ops_val_t)(op),
                 reinterpret_cast<void*>(&value));
 
-            DEBUG(NS_DEBUG::cnb_deb,
+            LF_DEB(NS_DEBUG::cnb_deb,
                 debug(debug::str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL")));
         }
 
         // Get the value (so we can check that the value we set is now returned)
         T new_value;
         ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t)(op), &new_value);
-        if constexpr (std::is_integral<T>::value) {
-            DEBUG(NS_DEBUG::cnb_err,
-                debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, debug::hex<8>(new_value)));
+        if constexpr (std::is_integral<T>::value)
+        {
+            LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"),
+                                          info, debug::hex<8>(new_value)));
         }
-        else {
-            DEBUG(NS_DEBUG::cnb_err,
+        else
+        {
+            LF_DEB(NS_DEBUG::cnb_err,
                 debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value));
         }
         //
-        if (ret)
-            throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info);
+        if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info);
 
         return ret;
     }
@@ -994,7 +1009,7 @@ class controller_base
         struct fi_info* hints = set_src_dst_addresses(info, tx);
 
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             debug(debug::str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY)));
 
         struct fid_ep* ep;
@@ -1005,8 +1020,7 @@ class controller_base
                                                   "endpoints?)");
         }
         fi_freeinfo(hints);
-        DEBUG(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep)));
         return ep;
     }
 
@@ -1019,7 +1033,7 @@ class controller_base
 
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo")));
         struct fi_info* hints = fi_dupinfo(info);
         if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo");
 
@@ -1035,7 +1049,7 @@ class controller_base
         else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); }
 
         // clang-format off
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             trace(debug::str<>("scalable endpoint"),
                   "Tx", tx,
                   "Threads", debug::dec<3>(threads),
@@ -1051,8 +1065,7 @@ class controller_base
         struct fid_ep* ep;
         ret = fi_scalable_ep(domain, new_hints, &ep, nullptr);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_scalable_ep");
-        DEBUG(NS_DEBUG::cnb_deb,
-            debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep)));
         fi_freeinfo(hints);
         return ep;
     }
@@ -1061,7 +1074,7 @@ class controller_base
     endpoint_wrapper& get_rx_endpoint()
     {
         static auto rx = NS_DEBUG::cnb_deb.make_timer(1, debug::str<>("get_rx_endpoint"));
-        DEBUG(NS_DEBUG::cnb_deb, timed(rx));
+        LF_DEB(NS_DEBUG::cnb_deb, timed(rx));
 
         if (endpoint_type_ == endpoint_type::scalableTxRx)
         {
@@ -1072,7 +1085,7 @@ class controller_base
                 if (!ok)
                 {
                     // clang-format off
-                    DEBUG(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx",
+                    LF_DEB(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx",
                         "ep", NS_DEBUG::ptr(ep.get_ep()),
                         "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()),
                         "rx cq", NS_DEBUG::ptr(ep.get_rx_cq())));
@@ -1081,10 +1094,10 @@ class controller_base
                 }
                 eps_->tl_srx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(),
                     ep.get_name(), &rx_endpoints_);
-                DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep",
-                                             NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq",
-                                             NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq",
-                                             NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq())));
+                LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep",
+                                              NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq",
+                                              NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq",
+                                              NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq())));
             }
             return eps_->tl_srx_.endpoint_;
         }
@@ -1103,7 +1116,7 @@ class controller_base
                     NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, "threadlocal");
 
                 // create a completion queue for tx endpoint
-                fabric_info_->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION;
+                fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
                 auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size,
                     "tx threadlocal");
 
@@ -1117,7 +1130,7 @@ class controller_base
                 enable_endpoint(ep_tx, "tx threadlocal");
 
                 // set threadlocal endpoint wrapper
-                DEBUG(NS_DEBUG::cnb_deb,
+                LF_DEB(NS_DEBUG::cnb_deb,
                     trace(debug::str<>("Threadlocal Ep"), "create Tx", "ep", NS_DEBUG::ptr(ep_tx),
                         "tx cq", NS_DEBUG::ptr(tx_cq), "rx cq", NS_DEBUG::ptr(nullptr)));
                 // for cleaning up at termination
@@ -1136,7 +1149,7 @@ class controller_base
                 bool             ok = tx_endpoints_.pop(ep);
                 if (!ok)
                 {
-                    DEBUG(NS_DEBUG::cnb_deb,
+                    LF_DEB(NS_DEBUG::cnb_deb,
                         error(debug::str<>("Scalable Ep"), "pop tx", "ep",
                             NS_DEBUG::ptr(ep.get_ep()), "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()),
                             "rx cq", NS_DEBUG::ptr(ep.get_rx_cq())));
@@ -1144,20 +1157,15 @@ class controller_base
                 }
                 eps_->tl_stx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(),
                     ep.get_name(), &tx_endpoints_);
-                DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep",
-                                             NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq",
-                                             NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq",
-                                             NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq())));
+                LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep",
+                                              NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq",
+                                              NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq",
+                                              NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq())));
             }
             return eps_->tl_stx_.endpoint_;
         }
         else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; }
-        else if (endpoint_type_ == endpoint_type::single)
-        {
-            // shared tx/rx endpoint
-            return eps_->ep_rx_;
-        }
-        // shared tx/rx endpoint
+        // single : shared tx/rx endpoint
         return eps_->ep_rx_;
     }
 
@@ -1166,7 +1174,7 @@ class controller_base
     {
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint)));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint)));
         int ret = fi_ep_bind(endpoint, &av->fid, 0);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector");
     }
@@ -1177,7 +1185,8 @@ class controller_base
     {
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type);
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type));
+        LF_DEB(NS_DEBUG::cnb_deb,
+            debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type));
         int ret = fi_ep_bind(endpoint, &cq->fid, cqtype);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq");
     }
@@ -1186,7 +1195,7 @@ class controller_base
     fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep)
     {
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
-        info->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION;
+        info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION);
         fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx");
         // shared send/recv endpoint - bind send cq to the recv endpoint
         bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix");
@@ -1198,7 +1207,8 @@ class controller_base
     {
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type);
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint)));
+        LF_DEB(NS_DEBUG::cnb_deb,
+            debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint)));
         int ret = fi_enable(endpoint);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable");
     }
@@ -1227,14 +1237,14 @@ class controller_base
                 temp1 << debug::ipaddr(&local_addr[i]) << " - ";
             }
 
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size",
-                                         debug::dec<>(addrlen), " : ", temp1.str().c_str()));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size",
+                                          debug::dec<>(addrlen), " : ", temp1.str().c_str()));
             std::stringstream temp2;
             for (std::size_t i = 0; i < locality_defs::array_length; ++i)
             {
                 temp2 << debug::hex<8>(local_addr[i]) << " - ";
             }
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str()));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str()));
         }
         return locality(local_addr);
     }
@@ -1253,6 +1263,9 @@ class controller_base
     // --------------------------------------------------------------------
     inline const locality& here() const { return here_; }
 
+    // --------------------------------------------------------------------
+    inline const fi_addr_t& fi_address() const { return here_.fi_address(); }
+
     // --------------------------------------------------------------------
     inline void setHere(const locality& val) { here_ = val; }
 
@@ -1297,19 +1310,73 @@ class controller_base
             addr.set_fi_address(fi_addr_t(i));
             if ((ret == 0) && (addrlen == locality_defs::array_size))
             {
-                DEBUG(NS_DEBUG::cnb_deb,
+                LF_DEB(NS_DEBUG::cnb_deb,
                     debug(debug::str<>("address vector"), debug::dec<3>(i), iplocality(addr)));
             }
             else
             {
-                DEBUG(NS_DEBUG::cnb_err,
-                    error(debug::str<>("address length"), debug::dec<3>(addrlen), debug::dec<3>(locality_defs::array_size)));
+                LF_DEB(NS_DEBUG::cnb_err,
+                    error(debug::str<>("address length"), debug::dec<3>(addrlen),
+                        debug::dec<3>(locality_defs::array_size)));
                 throw std::runtime_error("debug_print_av_vector : address vector "
                                          "traversal failure");
             }
         }
     }
 
+    // --------------------------------------------------------------------
+    inline constexpr bool bypass_tx_lock()
+    {
+#if defined(HAVE_LIBFABRIC_GNI)
+        return true;
+#elif defined(HAVE_LIBFABRIC_CXI)
+        // @todo : cxi provider is not yet thread safe using scalable endpoints
+        return false;
+#else
+        return (threadlevel_flags() == FI_THREAD_SAFE ||
+                endpoint_type_ == endpoint_type::threadlocalTx);
+#endif
+    }
+
+    // --------------------------------------------------------------------
+    inline controller_base::unique_lock get_tx_lock()
+    {
+        if (bypass_tx_lock()) return unique_lock();
+        return unique_lock(send_mutex_);
+    }
+
+    // --------------------------------------------------------------------
+    inline controller_base::unique_lock try_tx_lock()
+    {
+        if (bypass_tx_lock()) return unique_lock();
+        return unique_lock(send_mutex_, std::try_to_lock_t{});
+    }
+
+    // --------------------------------------------------------------------
+    inline constexpr bool bypass_rx_lock()
+    {
+#ifdef HAVE_LIBFABRIC_GNI
+        return true;
+#else
+        return (
+            threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx);
+#endif
+    }
+
+    // --------------------------------------------------------------------
+    inline controller_base::unique_lock get_rx_lock()
+    {
+        if (bypass_rx_lock()) return unique_lock();
+        return unique_lock(recv_mutex_);
+    }
+
+    // --------------------------------------------------------------------
+    inline controller_base::unique_lock try_rx_lock()
+    {
+        if (bypass_rx_lock()) return unique_lock();
+        return unique_lock(recv_mutex_, std::try_to_lock_t{});
+    }
+
     // --------------------------------------------------------------------
     progress_status poll_for_work_completions(void* user_data)
     {
@@ -1317,11 +1384,13 @@ class controller_base
         bool            retry = false;
         do {
             // sends
-            uint32_t nsend = static_cast<Derived*>(this)->poll_send_queue(get_tx_endpoint().get_tx_cq(), user_data);
+            uint32_t nsend = static_cast<Derived*>(this)->poll_send_queue(
+                get_tx_endpoint().get_tx_cq(), user_data);
             p.m_num_sends += nsend;
             retry = (nsend == max_completions_per_poll_);
             // recvs
-            uint32_t nrecv = static_cast<Derived*>(this)->poll_recv_queue(get_rx_endpoint().get_rx_cq(), user_data);
+            uint32_t nrecv = static_cast<Derived*>(this)->poll_recv_queue(
+                get_rx_endpoint().get_rx_cq(), user_data);
             p.m_num_recvs += nrecv;
             retry |= (nrecv == max_completions_per_poll_);
         } while (retry);
@@ -1352,7 +1421,7 @@ class controller_base
         cq_attr.wait_cond = FI_CQ_COND_NONE;
         cq_attr.size = size;
         cq_attr.flags = 0 /*FI_COMPLETION*/;
-        DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size)));
+        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size)));
         // open completion queue on fabric domain and set context to null
         int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open");
@@ -1375,7 +1444,7 @@ class controller_base
 #ifdef RX_CONTEXTS_SUPPORT
         while (num_rx_contexts >> ++rx_ctx_bits)
             ;
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits));
 #endif
         av_attr.rx_ctx_bits = rx_ctx_bits;
         // if contexts is nonzero, then we are using a single scalable endpoint
@@ -1387,11 +1456,11 @@ class controller_base
         }
         else
         {
-            DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE")));
+            LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE")));
             av_attr.type = FI_AV_TABLE;
         }
 
-        DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV")));
+        LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV")));
         int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr);
         if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open");
         return av;
@@ -1405,7 +1474,7 @@ class controller_base
     {
         [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__);
 
-        DEBUG(NS_DEBUG::cnb_deb,
+        LF_DEB(NS_DEBUG::cnb_deb,
             trace(debug::str<>("inserting AV"), iplocality(address), NS_DEBUG::ptr(av)));
         fi_addr_t fi_addr = 0xffffffff;
         int       ret = fi_av_insert(av, address.fabric_data(), 1, &fi_addr, 0, nullptr);
@@ -1417,8 +1486,8 @@ class controller_base
         }
         // address was generated correctly, now update the locality with the fi_addr
         locality new_locality(address, fi_addr);
-        DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr),
-                                     iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr)));
+        LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr),
+                                      iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr)));
         return new_locality;
     }
 };
diff --git a/src/libfabric/fabric_error.hpp b/src/libfabric/fabric_error.hpp
index bc7183a4..89422e19 100644
--- a/src/libfabric/fabric_error.hpp
+++ b/src/libfabric/fabric_error.hpp
@@ -15,17 +15,15 @@
 //
 #include <rdma/fi_errno.h>
 //
-#include "./print.hpp"
+#include "oomph_libfabric_defines.hpp"
 
-namespace oomph
+namespace NS_DEBUG
 {
 // cppcheck-suppress ConfigurationNotChecked
 static NS_DEBUG::enable_print<true> err_deb("ERROR__");
-} // namespace oomph
+} // namespace NS_DEBUG
 
-namespace oomph
-{
-namespace libfabric
+namespace NS_LIBFABRIC
 {
 
 class fabric_error : public std::runtime_error
@@ -36,7 +34,7 @@ class fabric_error : public std::runtime_error
     : std::runtime_error(std::string(fi_strerror(-err)) + msg)
     , error_(err)
     {
-        err_deb.error(msg, ":", fi_strerror(-err));
+        NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err));
         std::terminate();
     }
 
@@ -44,12 +42,11 @@ class fabric_error : public std::runtime_error
     : std::runtime_error(fi_strerror(-err))
     , error_(-err)
     {
-        err_deb.error(what());
+        NS_DEBUG::err_deb.error(what());
         std::terminate();
     }
 
     int error_;
 };
 
-} // namespace libfabric
-} // namespace oomph
+} // namespace NS_LIBFABRIC
diff --git a/src/libfabric/gni-debug.txt b/src/libfabric/gni-debug.txt
index 53eb3018..7f61601c 100644
--- a/src/libfabric/gni-debug.txt
+++ b/src/libfabric/gni-debug.txt
@@ -14,3 +14,5 @@ mpiexec -n 2 --oversubscribe konsole -e gdb -ex run --args /home/biddisco/build/
 # Build on LUMI
 # -----------------------------------------------
 cmake -DCMAKE_BUILD_TYPE=Release -DOOMPH_WITH_LIBFABRIC=ON -DOOMPH_LIBFABRIC_PROVIDER=cxi -DLIBFABRIC_ROOT=/opt/cray/libfabric/1.15.0.0/ -DOOMPH_WITH_BENCHMARKS=ON -DOOMPH_BENCHMARKS_MT=ON -DOOMPH_WITH_TESTING=ON /users/jobiddis/src/ghex/extern/oomph/
+
+MPICH_GNI_NDREG_ENTRIES=1024 MPICH_MAX_THREAD_SAFETY=single LIBFABRIC_POLL_SIZE=32 LIBFABRIC_ENDPOINT_TYPE=scalableTx  srun --cpu-bind=cores --unbuffered --ntasks 2 --cpus-per-task 1 timeout 120  /scratch/snx3000/biddisco/build/oomph/benchmarks/bench_p2p_bi_ft_avail_libfabric 50000 1000000 10 > dump
diff --git a/src/libfabric/libfabric_defines.hpp b/src/libfabric/libfabric_defines.hpp
deleted file mode 100644
index 03328979..00000000
--- a/src/libfabric/libfabric_defines.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * ghex-org
- *
- * Copyright (c) 2014-2023, ETH Zurich
- * All rights reserved.
- *
- * Please, refer to the LICENSE file in the root directory.
- * SPDX-License-Identifier: BSD-3-Clause
- */
-#pragma once
-
-// ------------------------------------------------------------------
-// This section exists to make interoperabily/sharing of code
-// between OOMPH/GHEX and HPX easier
-#if __has_include(<hpx/config/parcelport_defines.hpp>)
-#include <hpx/config/parcelport_defines.hpp>
-#elif __has_include("oomph_libfabric_defines.hpp")
-#include "oomph_libfabric_defines.hpp"
-#endif
diff --git a/src/libfabric/libfabric_defines_template.hpp b/src/libfabric/libfabric_defines_template.hpp
new file mode 100644
index 00000000..64c04944
--- /dev/null
+++ b/src/libfabric/libfabric_defines_template.hpp
@@ -0,0 +1,39 @@
+#ifndef OOMPH_LIBFABRIC_CONFIG_LIBFABRIC_HPP
+#define OOMPH_LIBFABRIC_CONFIG_LIBFABRIC_HPP
+
+// definitions that cmake generates from user options
+// clang-format off
+@oomph_config_defines@
+// clang-format on
+
+// ------------------------------------------------------------------
+// This section exists to make interoperabily/sharing of code
+// between OOMPH/GHEX and HPX easier - there are some files that do
+// the majority of libfabric initialization/setup and polling that
+// are basically the same in many apps, these files can be reused provided
+// some namespaces for the lib and for debugging are setup correctly
+
+#define NS_LIBFABRIC oomph::libfabric
+#define NS_MEMORY    oomph::libfabric
+#define NS_DEBUG     oomph::debug
+
+#ifndef LF_DEB
+#define LF_DEB(printer, Expr)                                                                      \
+    if constexpr (printer.is_enabled()) { printer.Expr; };
+#endif
+
+#define LFSOURCE_DIR "@OOMPH_SRC_LIBFABRIC_DIR@"
+#define LFPRINT_HPP  "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp"
+#define LFCOUNT_HPP  "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp"
+
+// oomph has a debug print helper file in the main source tree
+#if __has_include(LFPRINT_HPP)
+#include LFPRINT_HPP
+#define has_debug 1
+#endif
+
+#if __has_include(LFCOUNT_HPP)
+#include LFCOUNT_HPP
+#endif
+
+#endif
diff --git a/src/libfabric/locality.hpp b/src/libfabric/locality.hpp
index 2b65d61b..0bbbb16f 100644
--- a/src/libfabric/locality.hpp
+++ b/src/libfabric/locality.hpp
@@ -9,17 +9,17 @@
  */
 #pragma once
 
-#include <utility>
-#include <cstring>
-#include <cstdint>
 #include <array>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <utility>
 //
 #include <rdma/fabric.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 //
-#include "libfabric_defines.hpp"
-#include "print.hpp"
+#include "oomph_libfabric_defines.hpp"
 
 // Different providers use different address formats that we must accommodate
 // in our locality object.
@@ -31,6 +31,10 @@
 #define HAVE_LIBFABRIC_LOCALITY_SIZE 4
 #endif
 
+#ifdef HAVE_LIBFABRIC_EFA
+#define HAVE_LIBFABRIC_LOCALITY_SIZE 32
+#endif
+
 #if defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_TCP) ||                                \
     defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_PSM2)
 #define HAVE_LIBFABRIC_LOCALITY_SIZE 16
@@ -40,7 +44,7 @@
 namespace oomph
 {
 // cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> loc_deb("LOCALIT");
+static NS_DEBUG::enable_print<true> loc_deb("LOCALIT");
 } // namespace oomph
 
 namespace oomph
@@ -86,41 +90,41 @@ struct locality
     {
         std::memcpy(&data_[0], &in_data[0], locality_defs::array_size);
         fi_address_ = 0;
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this))));
     }
 
     locality()
     {
         std::memset(&data_[0], 0x00, locality_defs::array_size);
         fi_address_ = 0;
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this))));
     }
 
     locality(const locality& other)
     : data_(other.data_)
     , fi_address_(other.fi_address_)
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this))));
     }
 
     locality(const locality& other, fi_addr_t addr)
     : data_(other.data_)
     , fi_address_(addr)
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this))));
     }
 
     locality(locality&& other)
     : data_(std::move(other.data_))
     , fi_address_(other.fi_address_)
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this))));
     }
 
     // provided to support sockets mode bootstrap
     explicit locality(const std::string& address, const std::string& portnum)
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum));
         //
         struct sockaddr_in socket_data;
         memset(&socket_data, 0, sizeof(socket_data));
@@ -130,19 +134,19 @@ struct locality
         //
         std::memcpy(&data_[0], &socket_data, locality_defs::array_size);
         fi_address_ = 0;
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this))));
     }
 
     // some condition marking this locality as valid
     explicit inline operator bool() const
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this))));
         return (ip_address() != 0);
     }
 
     inline bool valid() const
     {
-        OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this))));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this))));
         return (ip_address() != 0);
     }
 
@@ -150,21 +154,21 @@ struct locality
     {
         data_ = other.data_;
         fi_address_ = other.fi_address_;
-        OOMPH_DP_ONLY(loc_deb,
+        LF_DEB(loc_deb,
             trace(NS_DEBUG::str<>("copy operator"), iplocality(*this), iplocality(other)));
         return *this;
     }
 
     bool operator==(const locality& other)
     {
-        OOMPH_DP_ONLY(loc_deb,
+        LF_DEB(loc_deb,
             trace(NS_DEBUG::str<>("equality operator"), iplocality(*this), iplocality(other)));
         return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0;
     }
 
     bool less_than(const locality& other)
     {
-        OOMPH_DP_ONLY(loc_deb,
+        LF_DEB(loc_deb,
             trace(NS_DEBUG::str<>("less operator"), iplocality(*this), iplocality(other)));
         if (ip_address() < other.ip_address()) return true;
         if (ip_address() == other.ip_address()) return port() < other.port();
@@ -179,6 +183,8 @@ struct locality
         return data_[0];
 #elif defined(HAVE_LIBFABRIC_CXI)
         return data_[0];
+#elif defined(HAVE_LIBFABRIC_EFA)
+        return data_[0];
 #else
         throw fabric_error(0, "unsupported fabric provider, please fix ASAP");
 #endif
@@ -192,6 +198,8 @@ struct locality
         return data[0];
 #elif defined(HAVE_LIBFABRIC_CXI)
         return data[0];
+#elif defined(HAVE_LIBFABRIC_EFA)
+        return data[0];
 #else
         throw fabric_error(0, "unsupported fabric provider, please fix ASAP");
 #endif
@@ -215,7 +223,7 @@ struct locality
   private:
     friend bool operator==(locality const& lhs, locality const& rhs)
     {
-        OOMPH_DP_ONLY(loc_deb,
+        LF_DEB(loc_deb,
             trace(NS_DEBUG::str<>("equality friend"), iplocality(lhs), iplocality(rhs)));
         return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_));
     }
@@ -226,8 +234,7 @@ struct locality
         const uint32_t&  a2 = rhs.ip_address();
         const fi_addr_t& f1 = lhs.fi_address();
         const fi_addr_t& f2 = rhs.fi_address();
-        OOMPH_DP_ONLY(loc_deb,
-            trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs)));
+        LF_DEB(loc_deb, trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs)));
         return (a1 < a2) || (a1 == a2 && f1 < f2);
     }
 
diff --git a/src/libfabric/memory_region.hpp b/src/libfabric/memory_region.hpp
index bb67a4ed..97cdfaa8 100644
--- a/src/libfabric/memory_region.hpp
+++ b/src/libfabric/memory_region.hpp
@@ -18,31 +18,40 @@
 #include <memory>
 #include <utility>
 
+#include "oomph_libfabric_defines.hpp"
 #include "fabric_error.hpp"
 
-// ------------------------------------------------------------------
-// This section exists to make interoperabily/sharing of code
-// between OOMPH/GHEX and HPX easier
-#if __has_include("./print.hpp")
-#include "print.hpp"
-#define DEBUG     OOMPH_DP_ONLY
-#define has_debug 1
-#elif __has_include(<hpx/debugging/print.hpp>)
-#include <hpx/debugging/print.hpp>
-#include <hpx/rma/rma_fwd.hpp>
-#define DEBUG(printer, Expr) HPX_DP_ONLY(printer, Expr)
-#define has_debug 1
-#else
-#define DEBUG(printer, Expr)
+#ifdef OOMPH_ENABLE_DEVICE
+#include <hwmalloc/device.hpp>
 #endif
 // ------------------------------------------------------------------
 
-#define NS_MEMORY oomph::libfabric
-
 namespace NS_MEMORY
 {
 
-static NS_DEBUG::enable_print<false> mrn_deb("REGION_");
+static NS_DEBUG::enable_print<true> mrn_deb("REGION_");
+
+/*
+struct fi_mr_attr {
+    const struct iovec *mr_iov;
+    size_t              iov_count;
+    uint64_t            access;
+    uint64_t            offset;
+    uint64_t            requested_key;
+    void               *context;
+    size_t              auth_key_size;
+    uint8_t            *auth_key;
+    enum fi_hmem_iface  iface;
+    union {
+        uint64_t        reserved;
+        int             cuda;
+        int             ze;
+        int             neuron;
+        int             synapseai;
+    } device;
+    void               *hmem_data;
+};
+*/
 
 // This is the only part of the code that actually
 // calls libfabric functions
@@ -53,30 +62,56 @@ struct region_provider
     using provider_domain = struct fid_domain;
 
     // register region
-    template<typename... Args>
-    static inline int register_memory(Args&&... args)
+    static inline int fi_register_memory(provider_domain* pd, int device_id, const void* buf,
+        size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key, struct fid_mr** mr)
     {
-        return fi_mr_reg(std::forward<Args>(args)...);
-    }
-
-    // register region
-    template<typename... Args>
-    static inline int register_memory_attr(Args&&... args)
-    {
-        [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope(__func__, std::forward<Args>(args)...);
-        //        int x = FI_HMEM_ROCR;
-        //        fi_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr,
-        //                    uint64_t flags, struct fid_mr **mr)
-        return fi_mr_regattr(std::forward<Args>(args)...);
+        [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope(__func__, NS_DEBUG::ptr(buf), NS_DEBUG::dec<>(len), device_id);
+        //
+        struct iovec addresses = {/*.iov_base = */const_cast<void*>(buf), /*.iov_len = */len};
+        fi_mr_attr attr = {
+            /*.mr_iov         = */&addresses,
+            /*.iov_count      = */1,
+            /*.access         = */access_flags,
+            /*.offset         = */offset,
+            /*.requested_key  = */request_key,
+            /*.context        = */nullptr,
+            /*.auth_key_size  = */0,
+            /*.auth_key       = */nullptr,
+            /*.iface          = */FI_HMEM_SYSTEM,
+            /*.device         = */{0},
+#if (FI_MAJOR_VERSION == 1) && (FI_MINOR_VERSION < 17)
+        };
+#else
+            /*.hmem_data      = */nullptr};
+#endif
+        if (device_id >= 0)
+        {
+#ifdef OOMPH_ENABLE_DEVICE
+            attr.device.cuda = device_id;
+            int handle = hwmalloc::get_device_id();
+            attr.device.cuda = handle;
+#if defined(OOMPH_DEVICE_CUDA)
+            attr.iface = FI_HMEM_CUDA;
+            LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle));
+#elif defined(OOMPH_DEVICE_HIP)
+            attr.iface = FI_HMEM_ROCR;
+            LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle));
+#endif
+#endif
+        }
+        uint64_t flags = 0;
+        int ret = fi_mr_regattr(pd, &attr, flags, mr);
+        if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); }
+        return ret;
     }
 
     // unregister region
     static inline int unregister_memory(provider_region* region) { return fi_close(&region->fid); }
 
     // Default registration flags for this provider
-    static inline constexpr int flags()
+    static inline constexpr int access_flags()
     {
-        return FI_READ | FI_WRITE | FI_RECV | FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE;
+        return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/;
     }
 
     // Get the local descriptor of the memory region.
@@ -116,7 +151,7 @@ struct memory_handle
     , size_{uint32_t(size)}
     , used_space_{0}
     {
-        //            DEBUG(NS_MEMORY::mrn_deb,
+        //            LF_DEB(NS_MEMORY::mrn_deb,
         //                trace(NS_DEBUG::str<>("memory_handle"), *this));
     }
 
@@ -181,16 +216,16 @@ struct memory_handle
     {
         if (region_ /*&& !get_user_region()*/)
         {
-            DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_));
+            LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_));
             //
             if (region_provider::unregister_memory(region_))
             {
-                DEBUG(NS_MEMORY::mrn_deb, error("fi_close mr failed"));
+                LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed"));
                 return -1;
             }
             else
             {
-                DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this));
+                LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this));
             }
             region_ = nullptr;
         }
@@ -201,16 +236,16 @@ struct memory_handle
     friend std::ostream& operator<<(std::ostream& os, memory_handle const& region)
     {
         (void)region;
-#if has_debug
-        // clang-format off
-            os /*<< "region "*/      << NS_DEBUG::ptr(&region)
-               //<< " fi_region "  << NS_DEBUG::ptr(region.region_)
-           << " address "    << NS_DEBUG::ptr(region.address_)
-           << " size "       << NS_DEBUG::hex<6>(region.size_)
-               //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/)
-               << " loc key "  << NS_DEBUG::ptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr)
-               << " rem key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0);
-        // clang-format on
+#if 1 || has_debug
+            os << "region "      << NS_DEBUG::ptr(&region)
+            //<< " fi_region "  << NS_DEBUG::ptr(region.region_)
+            << " address "    << NS_DEBUG::ptr(region.address_)
+            << " size "       << NS_DEBUG::hex<6>(region.size_)
+            //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/)
+            << " loc key "  << NS_DEBUG::ptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr)
+            << " rem key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0);
+        ///// clang-format off
+        ///// clang-format on
 #endif
         return os;
     }
@@ -274,7 +309,8 @@ struct memory_segment : public memory_handle
     // we do not cache local/remote keys here because memory segments are only
     // used by the heap to store chunks and the user will always receive
     // a memory_handle - which does have keys cached
-    memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr, void *ep)
+    memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr,
+        void* ep, int device_id)
     {
         // an rma key counter to keep some providers (CXI) happy
         static std::atomic<std::uint64_t> key = 0;
@@ -285,22 +321,25 @@ struct memory_segment : public memory_handle
         region_ = nullptr;
         //
         base_addr_ = memory_handle::address_;
-        DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this));
+        LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this, device_id));
 
-        int ret = region_provider::register_memory(pd, const_cast<void*>(buffer), length,
-            region_provider::flags(), 0, key++, 0, &(region_), nullptr);
-        if (ret) { throw libfabric::fabric_error(int(ret), "register_memory"); }
-        else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Registered region"), *this)); }
+        int ret = region_provider::fi_register_memory(pd, device_id, buffer, length,
+            region_provider::access_flags(), 0, key++, &(region_));
+        if (!ret)
+        {
+            LF_DEB(NS_MEMORY::mrn_deb,
+                trace(NS_DEBUG::str<>("Registered region"), "device", device_id, *this));
+        }
 
         if (bind_mr)
         {
             ret = fi_mr_bind(region_, (struct fid*)ep, 0);
-            if (ret) { throw libfabric::fabric_error(int(ret), "fi_mr_bind"); }
-            else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); }
+            if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); }
+            else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); }
 
             ret = fi_mr_enable(region_);
-            if (ret) { throw libfabric::fabric_error(int(ret), "fi_mr_enable"); }
-            else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); }
+            if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); }
+            else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); }
         }
     }
 
diff --git a/src/libfabric/operation_context.cpp b/src/libfabric/operation_context.cpp
index 11bc85dc..92ffbf78 100644
--- a/src/libfabric/operation_context.cpp
+++ b/src/libfabric/operation_context.cpp
@@ -13,38 +13,36 @@
 #include <communicator.hpp>
 #include <context.hpp>
 
-namespace oomph
-{
-namespace libfabric
+namespace oomph::libfabric
 {
 void
 operation_context::handle_cancelled()
 {
-    [[maybe_unused]] auto scp = ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
+    [[maybe_unused]] auto scp = opctx_deb.scope(NS_DEBUG::ptr(this), __func__);
     // enqueue the cancelled/callback
-    if (m_req.index() == 0)
+    if (std::holds_alternative<detail::request_state*>(m_req))
     {
         // regular (non-shared) recv
-        auto s = std::get<0>(m_req);
+        auto s = std::get<detail::request_state*>(m_req);
         while (!(s->m_comm->m_recv_cb_cancel.push(s))) {}
     }
-    else
+    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
     {
         // shared recv
-        auto s = std::get<1>(m_req);
-
+        auto s = std::get<detail::shared_request_state*>(m_req);
         while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {}
     }
+    else { throw std::runtime_error("Request state invalid in handle_cancelled"); }
 }
 
 int
 operation_context::handle_tagged_recv_completion_impl(void* user_data)
 {
-    [[maybe_unused]] auto scp = ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
-    if (m_req.index() == 0)
+    [[maybe_unused]] auto scp = opctx_deb.scope(NS_DEBUG::ptr(this), __func__);
+    if (std::holds_alternative<detail::request_state*>(m_req))
     {
         // regular (non-shared) recv
-        auto s = std::get<0>(m_req);
+        auto s = std::get<detail::request_state*>(m_req);
         //if (std::this_thread::get_id() == thread_id_)
         if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
         {
@@ -66,10 +64,10 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data)
             while (!(s->m_comm->m_recv_cb_queue.push(s))) {}
         }
     }
-    else
+    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
     {
         // shared recv
-        auto s = std::get<1>(m_req);
+        auto s = std::get<detail::shared_request_state*>(m_req);
         if (!s->m_comm->m_context->has_reached_recursion_depth())
         {
             auto inc = s->m_comm->m_context->recursion();
@@ -82,20 +80,36 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data)
             while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
         }
     }
+    else
+    {
+        detail::request_state** req = reinterpret_cast<detail::request_state**>(&m_req);
+        LF_DEB(NS_MEMORY::opctx_deb,
+            error(NS_DEBUG::str<>("invalid request_state"), this, "request", NS_DEBUG::ptr(req)));
+        throw std::runtime_error("Request state invalid in handle_tagged_recv");
+    }
     return 1;
 }
 
 int
 operation_context::handle_tagged_send_completion_impl(void* user_data)
 {
-    auto s = std::get<0>(m_req);
-    if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
+    if (std::holds_alternative<detail::request_state*>(m_req))
     {
-        if (!s->m_comm->has_reached_recursion_depth())
+        // regular (non-shared) recv
+        auto s = std::get<detail::request_state*>(m_req);
+        if (reinterpret_cast<oomph::communicator_impl*>(user_data) == s->m_comm)
         {
-            auto inc = s->m_comm->recursion();
-            auto ptr = s->release_self_ref();
-            s->invoke_cb();
+            if (!s->m_comm->has_reached_recursion_depth())
+            {
+                auto inc = s->m_comm->recursion();
+                auto ptr = s->release_self_ref();
+                s->invoke_cb();
+            }
+            else
+            {
+                // enqueue the callback
+                while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+            }
         }
         else
         {
@@ -103,12 +117,23 @@ operation_context::handle_tagged_send_completion_impl(void* user_data)
             while (!(s->m_comm->m_send_cb_queue.push(s))) {}
         }
     }
-    else
+    else if (std::holds_alternative<detail::shared_request_state*>(m_req))
     {
-        // enqueue the callback
-        while (!(s->m_comm->m_send_cb_queue.push(s))) {}
+        // shared recv
+        auto s = std::get<detail::shared_request_state*>(m_req);
+        if (!s->m_comm->m_context->has_reached_recursion_depth())
+        {
+            auto inc = s->m_comm->m_context->recursion();
+            auto ptr = s->release_self_ref();
+            s->invoke_cb();
+        }
+        else
+        {
+            // enqueue the callback
+            while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {}
+        }
     }
+    else { throw std::runtime_error("Request state invalid in handle_tagged_send"); }
     return 1;
 }
-} // namespace libfabric
-} // namespace oomph
+} // namespace oomph::libfabric
diff --git a/src/libfabric/operation_context.hpp b/src/libfabric/operation_context.hpp
index 4c0114a4..c99533db 100644
--- a/src/libfabric/operation_context.hpp
+++ b/src/libfabric/operation_context.hpp
@@ -13,16 +13,13 @@
 //
 #include <oomph/request.hpp>
 //
-#include <print.hpp>
-#include <operation_context_base.hpp>
+#include "operation_context_base.hpp"
 //
-namespace oomph
-{
-namespace libfabric
+namespace oomph::libfabric
 {
 
 // cppcheck-suppress ConfigurationNotChecked
-static NS_DEBUG::enable_print<false> ctx_deb("CONTEXT");
+static NS_DEBUG::enable_print<true> opctx_deb("OP__CXT");
 
 // This struct holds the ready state of a future
 // we must also store the context used in libfabric, in case
@@ -37,7 +34,7 @@ struct operation_context : public operation_context_base<operation_context>
     , m_req{req}
     {
         [[maybe_unused]] auto scp =
-            ctx_deb.scope(NS_DEBUG::ptr(this), __func__, NS_DEBUG::ptr(req));
+            opctx_deb.scope(NS_DEBUG::ptr(this), __func__, "request", req);
     }
 
     // --------------------------------------------------------------------
@@ -53,5 +50,4 @@ struct operation_context : public operation_context_base<operation_context>
     int handle_tagged_send_completion_impl(void* user_data);
 };
 
-} // namespace libfabric
-} // namespace oomph
+} // namespace oomph::libfabric
diff --git a/src/libfabric/operation_context_base.hpp b/src/libfabric/operation_context_base.hpp
index 058840ea..9625bae7 100644
--- a/src/libfabric/operation_context_base.hpp
+++ b/src/libfabric/operation_context_base.hpp
@@ -10,15 +10,15 @@
 #pragma once
 
 #include <rdma/fi_eq.h>
-#include "libfabric_defines.hpp"
-
-#define NS_LIBFABRIC oomph::libfabric
+#include "oomph_libfabric_defines.hpp"
 
 namespace NS_LIBFABRIC
 {
 
 class controller;
 
+static NS_DEBUG::enable_print<true> ctx_bas("CTXBASE");
+
 enum operation_context_type : int32_t
 {
     ctx_unknown = 0,
@@ -45,6 +45,7 @@ struct operation_context_base
     : context_reserved_space()
     , type_{ctype}
     {
+        [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::ptr(this), __func__);
     }
 
     // type is needed to smiplify the dispatch of errors
@@ -72,21 +73,14 @@ struct operation_context_base
     {
         return static_cast<Derived*>(this)->handle_tagged_send_completion_impl(user_data);
     }
-    int handle_tagged_send_completion_impl() { return 0; }
+    int handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; }
 
     // recv
-    int handle_recv_completion(std::uint64_t len, bool threadlocal)
-    {
-        return static_cast<Derived*>(this)->handle_recv_completion_impl(len, threadlocal);
-    }
-    int handle_recv_completion_impl(std::uint64_t /*len*/, bool /*threadlocal*/) { return 0; }
-
-    // recv + with source adddress (used with FI_SOURCE)
-    int handle_recv_src_completion(fi_addr_t const src_addr, std::uint64_t len)
+    int handle_recv_completion(std::uint64_t len)
     {
-        return static_cast<Derived*>(this)->handle_recv_src_completion_impl(src_addr, len);
+        return static_cast<Derived*>(this)->handle_recv_completion_impl(len);
     }
-    int handle_recv_src_completion_impl(fi_addr_t const src_addr, std::uint64_t len) { return 0; }
+    int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; }
 
     // tagged recv
     int handle_tagged_recv_completion(void* user_data)
diff --git a/src/libfabric/print.hpp b/src/libfabric/print.hpp
index 37265bc3..7036d759 100644
--- a/src/libfabric/print.hpp
+++ b/src/libfabric/print.hpp
@@ -72,18 +72,9 @@ extern char** environ;
 // The output will only be produced every N seconds
 // ------------------------------------------------------------
 
-// Used to wrap function call parameters to prevent evaluation
-// when debugging is disabled
-#define OOMPH_DP_LAZY(printer, Expr) printer.eval([&] { return Expr; })
-#if (__cplusplus >= 201703L)
-#define OOMPH_DP_ONLY(printer, Expr)                                                               \
-    if constexpr (printer.is_enabled()) { printer.Expr; };
-#else
-#define OOMPH_DP_ONLY(printer, Expr)                                                               \
-    if (printer.is_enabled()) { printer.Expr; };
-#endif
-
 #define NS_DEBUG oomph::debug
+#define LF_DEB(printer, Expr)                                                                      \
+    if constexpr (printer.is_enabled()) { printer.Expr; };
 
 // ------------------------------------------------------------
 /// \cond NODETAIL
@@ -721,5 +712,5 @@ struct enable_print<true>
     }
 };
 
-} // namespace oomph::debug
+} // namespace NS_DEBUG
 /// \endcond
diff --git a/src/libfabric/request_state.hpp b/src/libfabric/request_state.hpp
index 03bca9d5..25f7ea60 100644
--- a/src/libfabric/request_state.hpp
+++ b/src/libfabric/request_state.hpp
@@ -24,7 +24,7 @@ struct request_state
 {
     using base = request_state_base<false>;
     using shared_ptr_t = util::unsafe_shared_ptr<request_state>;
-    using operation_context = oomph::libfabric::operation_context;
+    using operation_context = libfabric::operation_context;
 
     operation_context                      m_operation_context;
     util::unsafe_shared_ptr<request_state> m_self_ptr;
@@ -34,12 +34,7 @@ struct request_state
     : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
     , m_operation_context{this}
     {
-        [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
-    }
-
-    ~request_state()
-    {
-        [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
+        //[[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__);
     }
 
     void progress();
@@ -67,7 +62,7 @@ struct shared_request_state
 {
     using base = request_state_base<true>;
     using shared_ptr_t = std::shared_ptr<shared_request_state>;
-    using operation_context = oomph::libfabric::operation_context;
+    using operation_context = libfabric::operation_context;
 
     operation_context                     m_operation_context;
     std::shared_ptr<shared_request_state> m_self_ptr;
@@ -77,12 +72,12 @@ struct shared_request_state
     : base{ctxt, comm, scheduled, rank, tag, std::move(cb)}
     , m_operation_context{this}
     {
-        [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
+        [[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__);
     }
 
     ~shared_request_state()
     {
-        [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__);
+        [[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__);
     }
 
     void progress();
diff --git a/src/libfabric/simple_counter.hpp b/src/libfabric/simple_counter.hpp
index 44b79ef7..f44eac92 100644
--- a/src/libfabric/simple_counter.hpp
+++ b/src/libfabric/simple_counter.hpp
@@ -9,7 +9,7 @@
  */
 #pragma once
 
-#include "libfabric_defines.hpp"
+#include "oomph_libfabric_defines.hpp"
 //
 #include <atomic>
 #include <type_traits>
diff --git a/src/mpi/context.hpp b/src/mpi/context.hpp
index bead5917..2d221280 100644
--- a/src/mpi/context.hpp
+++ b/src/mpi/context.hpp
@@ -95,7 +95,7 @@ register_memory<context_impl>(context_impl& c, void* ptr, std::size_t)
 #if OOMPH_ENABLE_DEVICE
 template<>
 inline region
-register_device_memory<context_impl>(context_impl& c, void* ptr, std::size_t)
+register_device_memory<context_impl>(context_impl& c, int, void* ptr, std::size_t)
 {
     return c.make_region(ptr);
 }
diff --git a/src/mpi/rma_context.hpp b/src/mpi/rma_context.hpp
index e3b04b1a..aec295f0 100644
--- a/src/mpi/rma_context.hpp
+++ b/src/mpi/rma_context.hpp
@@ -75,7 +75,7 @@ register_memory<rma_context>(rma_context& c, void* ptr, std::size_t size)
 #if OOMPH_ENABLE_DEVICE
 template<>
 inline rma_region
-register_device_memory<rma_context>(rma_context& c, void* ptr, std::size_t size)
+register_device_memory<rma_context>(rma_context& c, int, void* ptr, std::size_t size)
 {
     return c.make_region(ptr, size);
 }
diff --git a/src/ucx/context.hpp b/src/ucx/context.hpp
index 728a4506..760d24e7 100644
--- a/src/ucx/context.hpp
+++ b/src/ucx/context.hpp
@@ -251,7 +251,7 @@ register_memory<context_impl>(context_impl& c, void* ptr, std::size_t)
 #if OOMPH_ENABLE_DEVICE
 template<>
 inline region
-register_device_memory<context_impl>(context_impl& c, void* ptr, std::size_t)
+register_device_memory<context_impl>(context_impl& c, int, void* ptr, std::size_t)
 {
     return c.make_region(ptr);
 }
diff --git a/src/ucx/rma_context.hpp b/src/ucx/rma_context.hpp
index 25ff69d6..6f506a1a 100644
--- a/src/ucx/rma_context.hpp
+++ b/src/ucx/rma_context.hpp
@@ -60,9 +60,9 @@ register_memory<rma_context>(rma_context& c, void* ptr, std::size_t size)
 #if OOMPH_ENABLE_DEVICE
 template<>
 inline rma_region
-register_device_memory<rma_context>(rma_context& c, void* ptr, std::size_t size)
+register_device_memory<rma_context>(rma_context& c, int, void* ptr, std::size_t size)
 {
-    return c.make_region(ptr, size, true);
+    return c.make_region(ptr, size);
 }
 #endif
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5217bbaf..92804a6e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,6 +2,11 @@ add_subdirectory(mpi_runner)
 
 set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Daint)")
 
+#------------------------------------------------------------------------------
+# Find Threads
+#------------------------------------------------------------------------------
+find_package(Threads REQUIRED)
+
 # ---------------------------------------------------------------------
 # compile tests
 # ---------------------------------------------------------------------
@@ -10,7 +15,7 @@ set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Da
 set(serial_tests test_unique_function test_unsafe_shared_ptr)
 
 # list of parallel tests to be executed
-set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality)
+set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality test_mem_reg)
 #test_tag_range)
 if (OOMPH_ENABLE_BARRIER)
     list(APPEND parallel_tests test_barrier)
@@ -24,7 +29,7 @@ function(compile_test t_)
     if (OOMPH_TEST_LEAK_GPU_MEMORY)
         target_compile_definitions(${t} PRIVATE OOMPH_TEST_LEAK_GPU_MEMORY)
     endif()
-    target_link_libraries(${t} PRIVATE ext-gtest)
+    target_link_libraries(${t} PRIVATE ${GTEST_LIBRARIES})
     target_link_libraries(${t} PUBLIC oomph)
 endfunction()
 
@@ -43,7 +48,7 @@ endforeach()
 function(reg_serial_test t)
     add_executable(${t} $<TARGET_OBJECTS:${t}_obj>)
     oomph_target_compile_options(${t})
-    target_link_libraries(${t} PRIVATE ext-gtest)
+    target_link_libraries(${t} PRIVATE GTest::gtest)
     target_link_libraries(${t} PRIVATE oomph_common)
     add_test(
         NAME ${t}
@@ -59,7 +64,7 @@ function(reg_parallel_test t_ lib n)
     set(t ${t_}_${lib})
     add_executable(${t} $<TARGET_OBJECTS:${t_}_obj>)
     oomph_target_compile_options(${t})
-    target_link_libraries(${t} PRIVATE gtest_main_mpi)
+    target_link_libraries(${t} PRIVATE gtest_main_mpi Threads::Threads)
     target_link_libraries(${t} PRIVATE oomph_${lib})
     add_test(
         NAME ${t}
@@ -70,19 +75,19 @@ endfunction()
 
 if (OOMPH_WITH_MPI)
     foreach(t ${parallel_tests})
-        reg_parallel_test(${t} mpi 4)
+        reg_parallel_test(${t} mpi 2)
     endforeach()
 endif()
 
 if (OOMPH_WITH_UCX)
     foreach(t ${parallel_tests})
-        reg_parallel_test(${t} ucx 4)
+        reg_parallel_test(${t} ucx 2)
     endforeach()
 endif()
 
 if (OOMPH_WITH_LIBFABRIC)
     foreach(t ${parallel_tests})
-        reg_parallel_test(${t} libfabric 4)
+        reg_parallel_test(${t} libfabric 2)
     endforeach()
 endif()
 
diff --git a/test/mpi_runner/CMakeLists.txt b/test/mpi_runner/CMakeLists.txt
index 47207747..408f9d1a 100644
--- a/test/mpi_runner/CMakeLists.txt
+++ b/test/mpi_runner/CMakeLists.txt
@@ -1,4 +1,2 @@
-add_library(gtest_main_mpi ./gtest_main_mpi.cpp)
-target_link_libraries(gtest_main_mpi PRIVATE ext-gtest)
-target_link_libraries(gtest_main_mpi PRIVATE MPI::MPI_CXX)
-
+add_library(gtest_main_mpi OBJECT ./gtest_main_mpi.cpp)
+target_link_libraries(gtest_main_mpi PRIVATE GTest::gtest MPI::MPI_CXX)
diff --git a/test/test_mem_reg.cpp b/test/test_mem_reg.cpp
new file mode 100644
index 00000000..d61a9bcc
--- /dev/null
+++ b/test/test_mem_reg.cpp
@@ -0,0 +1,258 @@
+/*
+ * ghex-org
+ *
+ * Copyright (c) 2014-2023, ETH Zurich
+ * All rights reserved.
+ *
+ * Please, refer to the LICENSE file in the root directory.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+#include <oomph/context.hpp>
+#include <gtest/gtest.h>
+#include "./mpi_runner/mpi_test_fixture.hpp"
+#include <iostream>
+#include <iomanip>
+#include <thread>
+#include <atomic>
+
+#define NITERS   2
+#define SIZE     64
+#define NTHREADS 1
+
+std::vector<std::atomic<int>> shared_received(NTHREADS);
+thread_local int              thread_id;
+
+void
+reset_counters()
+{
+    for (auto& x : shared_received) x.store(0);
+}
+
+struct test_environment_base
+{
+    using rank_type = oomph::rank_type;
+    using tag_type = oomph::tag_type;
+    using message = oomph::message_buffer<rank_type>;
+
+    oomph::context&     ctxt;
+    oomph::communicator comm;
+    rank_type           speer_rank;
+    rank_type           rpeer_rank;
+    int                 thread_id;
+    int                 num_threads;
+    tag_type            tag;
+
+    test_environment_base(oomph::context& c, int tid, int num_t)
+    : ctxt(c)
+    , comm(ctxt.get_communicator())
+    , speer_rank((comm.rank() + 1) % comm.size())
+    , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size())
+    , thread_id(tid)
+    , num_threads(num_t)
+    , tag(tid)
+    {
+    }
+};
+
+struct test_environment : public test_environment_base
+{
+    using base = test_environment_base;
+
+    static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc,
+        rank_type* ptr)
+    {
+        if (user_alloc) return comm.make_buffer<rank_type>(ptr, size);
+        else
+            return comm.make_buffer<rank_type>(size);
+    }
+
+    std::vector<rank_type> raw_smsg;
+    std::vector<rank_type> raw_rmsg;
+    message                smsg;
+    message                rmsg;
+
+    test_environment(oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc)
+    : base(c, tid, num_t)
+    , raw_smsg(user_alloc ? size : 0)
+    , raw_rmsg(user_alloc ? size : 0)
+    , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data()))
+    , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data()))
+    {
+        fill_send_buffer();
+        fill_recv_buffer();
+    }
+
+    void fill_send_buffer()
+    {
+        for (auto& x : smsg) x = comm.rank();
+    }
+
+    void fill_recv_buffer()
+    {
+        for (auto& x : rmsg) x = -1;
+    }
+
+    bool check_recv_buffer()
+    {
+        for (auto const& x : rmsg)
+            if (x != rpeer_rank) return false;
+        return true;
+    }
+};
+
+#if HWMALLOC_ENABLE_DEVICE
+struct test_environment_device : public test_environment_base
+{
+    using base = test_environment_base;
+
+    static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc,
+        rank_type* device_ptr)
+    {
+        if (user_alloc) return comm.make_device_buffer<rank_type>(device_ptr, size, 0);
+        else
+            return comm.make_device_buffer<rank_type>(size, 0);
+    }
+
+    struct device_allocation
+    {
+        void* m_ptr = nullptr;
+        device_allocation(std::size_t size = 0)
+        {
+            if (size) m_ptr = hwmalloc::device_malloc(size * sizeof(rank_type));
+        }
+        device_allocation(device_allocation&& other)
+        : m_ptr{std::exchange(other.m_ptr, nullptr)}
+        {
+        }
+        ~device_allocation()
+        {
+#ifndef OOMPH_TEST_LEAK_GPU_MEMORY
+            if (m_ptr) hwmalloc::device_free(m_ptr);
+#endif
+        }
+        rank_type* get() const noexcept { return (rank_type*)m_ptr; }
+    };
+
+    device_allocation raw_device_smsg;
+    device_allocation raw_device_rmsg;
+    message           smsg;
+    message           rmsg;
+
+    test_environment_device(oomph::context& c, std::size_t size, int tid, int num_t,
+        bool user_alloc)
+    : base(c, tid, num_t)
+#ifndef OOMPH_TEST_LEAK_GPU_MEMORY
+    , raw_device_smsg(user_alloc ? size : 0)
+    , raw_device_rmsg(user_alloc ? size : 0)
+    , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get()))
+    , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get()))
+#else
+    , raw_device_smsg(size)
+    , raw_device_rmsg(size)
+    , smsg(make_buffer(comm, size, true, raw_device_smsg.get()))
+    , rmsg(make_buffer(comm, size, true, raw_device_rmsg.get()))
+#endif
+    {
+        fill_send_buffer();
+        fill_recv_buffer();
+    }
+
+    void fill_send_buffer()
+    {
+        for (auto& x : smsg) x = comm.rank();
+        smsg.clone_to_device();
+    }
+
+    void fill_recv_buffer()
+    {
+        for (auto& x : rmsg) x = -1;
+        rmsg.clone_to_device();
+    }
+
+    bool check_recv_buffer()
+    {
+        rmsg.clone_to_host();
+        for (auto const& x : rmsg)
+            if (x != rpeer_rank) return false;
+        return true;
+    }
+};
+#endif
+
+template<typename Func>
+void
+launch_test(Func f)
+{
+    // single threaded
+    {
+        oomph::context ctxt(MPI_COMM_WORLD, false);
+        reset_counters();
+        f(ctxt, SIZE, 0, 1, false);
+        reset_counters();
+        f(ctxt, SIZE, 0, 1, true);
+    }
+
+    // multi threaded
+    {
+        oomph::context           ctxt(MPI_COMM_WORLD, true);
+        std::vector<std::thread> threads;
+        threads.reserve(NTHREADS);
+        reset_counters();
+        for (int i = 0; i < NTHREADS; ++i)
+            threads.push_back(std::thread{f, std::ref(ctxt), SIZE, i, NTHREADS, false});
+        for (auto& t : threads) t.join();
+        threads.clear();
+        reset_counters();
+        for (int i = 0; i < NTHREADS; ++i)
+            threads.push_back(std::thread{f, std::ref(ctxt), SIZE, i, NTHREADS, true});
+        for (auto& t : threads) t.join();
+    }
+}
+
+// no callback
+// ===========
+template<typename Env>
+void
+test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc)
+{
+    Env env(ctxt, size, tid, num_threads, user_alloc);
+
+    // use is_ready() -> must manually progress the communicator
+    for (int i = 0; i < NITERS; i++)
+    {
+        auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag);
+        auto sreq = env.comm.send(env.smsg, env.speer_rank, env.tag);
+        while (!(rreq.is_ready() && sreq.is_ready())) { env.comm.progress(); };
+        EXPECT_TRUE(env.check_recv_buffer());
+        env.fill_recv_buffer();
+    }
+
+    // use test() -> communicator is progressed automatically
+    for (int i = 0; i < NITERS; i++)
+    {
+        auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag);
+        auto sreq = env.comm.send(env.smsg, env.speer_rank, env.tag);
+        while (!(rreq.test() && sreq.test())) {};
+        EXPECT_TRUE(env.check_recv_buffer());
+        env.fill_recv_buffer();
+    }
+
+    // use wait() -> communicator is progressed automatically
+    for (int i = 0; i < NITERS; i++)
+    {
+        auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag);
+        env.comm.send(env.smsg, env.speer_rank, env.tag).wait();
+        rreq.wait();
+        EXPECT_TRUE(env.check_recv_buffer());
+        env.fill_recv_buffer();
+    }
+}
+
+TEST_F(mpi_test_fixture, send_recv)
+{
+    launch_test(test_send_recv<test_environment>);
+    std::cout << "\n\n\n\n\n\n\n\n" << std::endl;
+#if HWMALLOC_ENABLE_DEVICE
+    launch_test(test_send_recv<test_environment_device>);
+#endif
+}
diff --git a/test/test_unique_function.cpp b/test/test_unique_function.cpp
index 8e18b86e..9d38d79b 100644
--- a/test/test_unique_function.cpp
+++ b/test/test_unique_function.cpp
@@ -53,11 +53,9 @@ TEST(unqiue_function, simple_function)
     EXPECT_EQ(3, uf1(4));
 }
 
-
 void test_stats(ctor_stats_data const& stats, int n_ctor, int n_dtor, int n_dtor_of_moved,
     int n_move_ctor, int n_calls);
 
-
 // small function which fits within the stack buffer
 struct small_function
 {
@@ -267,3 +265,10 @@ test_stats(ctor_stats_data const& stats, int n_ctor, int n_dtor, int n_dtor_of_m
 
     //std::cout << stats << std::endl;
 }
+
+int
+main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/test/test_unsafe_shared_ptr.cpp b/test/test_unsafe_shared_ptr.cpp
index a592f6b1..2668aa14 100644
--- a/test/test_unsafe_shared_ptr.cpp
+++ b/test/test_unsafe_shared_ptr.cpp
@@ -155,3 +155,10 @@ TEST(unsafe_shared_ptr, move_assign)
         EXPECT_EQ(d.alloc_ref_count, 0);
     }
 }
+
+int
+main(int argc, char** argv)
+{
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}