diff --git a/.gitmodules b/.gitmodules index 2fa156d1..5857c773 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,7 +2,3 @@ path = ext/hwmalloc url = https://github.com/ghex-org/hwmalloc.git branch = master -[submodule "ext/googletest"] - path = ext/googletest - url = https://github.com/google/googletest.git - branch = main diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c60d1b1..e0ae8b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(OOMPH_GIT_SUBMODULE "Check submodules during build" ON) option(OOMPH_USE_BUNDLED_LIBS "Use bundled 3rd party libraries" OFF) include(oomph_external_dependencies) +include(ExternalProject) # --------------------------------------------------------------------- # Define main oomph library @@ -94,6 +95,8 @@ add_subdirectory(bindings) # --------------------------------------------------------------------- set(OOMPH_WITH_TESTING OFF CACHE BOOL "True if tests shall be built") if (OOMPH_WITH_TESTING) + set(CTEST_TEST_TIMEOUT 30) + include(CTest) enable_testing() add_subdirectory(test) endif() diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 9652b48b..8e018cef 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -22,6 +22,7 @@ function(make_benchmark t_ lib) add_executable(${t} ${t_}_mt.cpp) oomph_target_compile_options(${t}) target_link_libraries(${t} PRIVATE oomph_${lib}) + target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_${lib}) endfunction() function(make_benchmark_mt t_ lib) @@ -30,6 +31,7 @@ function(make_benchmark_mt t_ lib) add_executable(${t} ${t_}_mt.cpp) oomph_target_compile_options(${t}) target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_MT) + target_compile_definitions(${t} PRIVATE OOMPH_BENCHMARKS_${lib}) target_link_libraries(${t} PRIVATE oomph_${lib}) target_link_libraries(${t} PRIVATE OpenMP::OpenMP_CXX) endfunction() @@ -104,6 +106,7 @@ if (OOMPH_WITH_LIBFABRIC) make_benchmark_mt(${t} libfabric) endif() endforeach() + add_subdirectory(scripts) endif() else() message("warning: benchmarks cannot be built unless barrier is enabled") diff --git a/benchmarks/mpi_environment.hpp b/benchmarks/mpi_environment.hpp index 7affabd3..50d2c2bb 100644 --- a/benchmarks/mpi_environment.hpp +++ b/benchmarks/mpi_environment.hpp @@ -12,6 +12,25 @@ #include #include +#ifdef OOMPH_BENCHMARKS_mpi +#define TRANSPORT_STRING "mpi" +#define PROGRESS_STRING "unspecified" +#define ENDPOINT_STRING "unspecified" +#endif + +#ifdef OOMPH_BENCHMARKS_ucx +#define TRANSPORT_STRING "ucx" +#define PROGRESS_STRING "unspecified" +#define ENDPOINT_STRING "unspecified" +#endif + +#ifdef OOMPH_BENCHMARKS_libfabric +#include "../src/libfabric/controller.hpp" +#define TRANSPORT_STRING "libfabric" +#define PROGRESS_STRING LIBFABRIC_PROGRESS_STRING +#define ENDPOINT_STRING LIBFABRIC_ENDPOINT_STRING +#endif + namespace oomph { struct mpi_environment diff --git a/benchmarks/mpi_p2p_bi_avail_mt.cpp b/benchmarks/mpi_p2p_bi_avail_mt.cpp index 4bc1dedd..6013eb2c 100644 --- a/benchmarks/mpi_p2p_bi_avail_mt.cpp +++ b/benchmarks/mpi_p2p_bi_avail_mt.cpp @@ -23,6 +23,13 @@ #include #endif /* OOMPH_BENCHMARKS_MT */ +#ifdef USE_TESTANY +const char* syncmode = "testany"; +#else +const char* syncmode = "test"; +#endif +const char* waitmode = "avail"; + int main(int argc, char* argv[]) { @@ -209,8 +216,26 @@ main(int argc, char* argv[]) MPI_Barrier(MPI_COMM_WORLD); if (rank == 1) { + double elapsed = t1.toc(); t1.vtoc(); t1.vtoc("final ", (double)niter * size * buff_size); + double bw = ((double)(niter * size) * buff_size) / elapsed; + // clang-format off + std::cout << "time: " << elapsed/1000000 << "s\n"; + std::cout << "final MB/s: " << bw << "\n"; + std::cout << "CSVData" + << ", niter, " << niter + << ", buff_size, " << buff_size + << ", inflight, " << inflight + << ", num_threads, " << cmd_args.num_threads + << ", syncmode, " << syncmode + << ", waitmode, " << waitmode + << ", transport, " << "Native-MPI" + << ", BW MB/s, " << bw + << ", progress, " << "unspecified" + << ", endpoint, " << "unspecified" + << "\n"; + // clang-format on } return 0; diff --git a/benchmarks/mpi_p2p_bi_wait_mt.cpp b/benchmarks/mpi_p2p_bi_wait_mt.cpp index c19d69e6..617862de 100644 --- a/benchmarks/mpi_p2p_bi_wait_mt.cpp +++ b/benchmarks/mpi_p2p_bi_wait_mt.cpp @@ -23,6 +23,13 @@ #include #endif /* OOMPH_BENCHMARKS_MT */ +#ifdef USE_WAITALL +const char* syncmode = "waitall"; +#else +const char* syncmode = "wait"; +#endif +const char* waitmode = "wait"; + int main(int argc, char* argv[]) { @@ -155,8 +162,26 @@ main(int argc, char* argv[]) MPI_Barrier(MPI_COMM_WORLD); if (rank == 1) { + double elapsed = t1.toc(); t1.vtoc(); t1.vtoc("final ", (double)niter * size * buff_size); + double bw = ((double)(niter * size) * buff_size) / elapsed; + // clang-format off + std::cout << "time: " << elapsed/1000000 << "s\n"; + std::cout << "final MB/s: " << bw << "\n"; + std::cout << "CSVData" + << ", niter, " << niter + << ", buff_size, " << buff_size + << ", inflight, " << inflight + << ", num_threads, " << cmd_args.num_threads + << ", syncmode, " << syncmode + << ", waitmode, " << waitmode + << ", transport, " << "Native-MPI" + << ", BW MB/s, " << bw + << ", progress, " << "unspecified" + << ", endpoint, " << "unspecified" + << "\n"; + // clang-format on } return 0; diff --git a/benchmarks/scripts/CMakeLists.txt b/benchmarks/scripts/CMakeLists.txt new file mode 100644 index 00000000..e74f3c70 --- /dev/null +++ b/benchmarks/scripts/CMakeLists.txt @@ -0,0 +1,68 @@ +#------------------------------------------------------- +# Slurm job launching script generator +# +# We would like to generate a script which can be used to generate batch jobs +# to submit benchmarks to slurm using many combinations of settings. +# +# We add a custom command which takes our template script and +# expands out all the variables we need to pass into it. +# Unfortunately, due to the way cmake works, some variables are only known +# at build time, and not at cmake configure time. Using a custom command which +# calls cmake to run our script at build time, allows us to pass variables +# into the final script which is placed in our build dir. +# +# Note that we generate these scripts in the build dir instead +# of the install dir as they are intended for development testing. +# A version could be supported for installation later. +#------------------------------------------------------- + +set(OOMPH_BENCHMARK_SCRIPTS_PATH "${PROJECT_BINARY_DIR}/scripts" CACHE PATH "Directory to place batch scripts in") +mark_as_advanced(OOMPH_BENCHMARK_SCRIPTS_PATH) + +# Make sure scripts dir exists +execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${OOMPH_BENCHMARK_SCRIPTS_PATH}") + +# The MPI backend is always enabled and uses no extension on the benchmarks +# it will be added automatically by the script runner +set(BENCHMARK_SUFFIXES) + +if (OOMPH_WITH_LIBFABRIC) + list(APPEND BENCHMARK_SUFFIXES _libfabric) +endif() +if (OOMPH_WITH_UCX) + list(APPEND BENCHMARK_SUFFIXES _ucx) +endif() + +#-------------------------------------------------- +# Slurm script generator for benchmarks +#-------------------------------------------------- +set(SCRIPTS "generate-oomph.py") + +foreach(script ${SCRIPTS}) + ADD_CUSTOM_COMMAND( + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${script}" + COMMAND "${CMAKE_COMMAND}" + ARGS + # needed by the benchmark script + -DBIN_DIR=${PROJECT_BINARY_DIR}/benchmarks + -DRUN_DIR=${OOMPH_BENCHMARK_SCRIPTS_PATH} + -DMPIEXEC="${MPIEXEC}" + + # needed by the copy script + -DSCRIPT_SOURCE_DIR="${CMAKE_CURRENT_SOURCE_DIR}" + -DSCRIPT_NAME=${script} + -DSCRIPT_DEST_DIR="${OOMPH_BENCHMARK_SCRIPTS_PATH}" + + # quoted as it might be a list + "-DBENCHMARK_SUFFIXES=${BENCHMARK_SUFFIXES}" + -DJOB_OPTIONS1="${SLURM_JOB_OPTIONS1}" + -P "${CMAKE_CURRENT_SOURCE_DIR}/copy_script.cmake" + + OUTPUT "${OOMPH_BENCHMARK_SCRIPTS_PATH}/${script}" + VERBATIM + ) + + add_custom_target(script-${script} + DEPENDS "${OOMPH_BENCHMARK_SCRIPTS_PATH}/${script}" + ) +endforeach(script) diff --git a/benchmarks/scripts/copy_script.cmake b/benchmarks/scripts/copy_script.cmake new file mode 100644 index 00000000..837676de --- /dev/null +++ b/benchmarks/scripts/copy_script.cmake @@ -0,0 +1,22 @@ +FILE(TO_NATIVE_PATH "${SCRIPT_SOURCE_DIR}/${SCRIPT_NAME}" INFILE) +FILE(TO_NATIVE_PATH "${SCRIPT_DEST_DIR}/${SCRIPT_NAME}" OUTFILE) + +STRING(REPLACE "\"" "" FILE1 "${INFILE}") +STRING(REPLACE "\"" "" FILE2 "${OUTFILE}") + +# +# when debugging, this dumps all vars out +# +if (DEBUG_THIS_SCRIPT) + get_cmake_property(_variableNames VARIABLES) + list (SORT _variableNames) + foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") + endforeach() +endif() + +configure_file( + "${FILE1}" + "${FILE2}" + @ONLY +) diff --git a/benchmarks/scripts/generate-oomph.ipynb b/benchmarks/scripts/generate-oomph.ipynb new file mode 100644 index 00000000..eb077b36 --- /dev/null +++ b/benchmarks/scripts/generate-oomph.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import product\n", + "import math\n", + "import numpy as np\n", + "import inspect\n", + "import os\n", + "import time\n", + "from IPython.display import Image, display, HTML\n", + "import importlib\n", + "import socket\n", + "import argparse\n", + "\n", + "# working dir\n", + "cwd = os.getcwd()\n", + "\n", + "# name of this script\n", + "scriptname = inspect.getframeinfo(inspect.currentframe()).filename\n", + "scriptpath = os.path.dirname(os.path.abspath(scriptname))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook generate-oomph.ipynb to script\n", + "[NbConvertApp] Writing 13176 bytes to generate-oomph.py\n" + ] + } + ], + "source": [ + "def is_notebook():\n", + " try:\n", + " shell = get_ipython().__class__.__name__\n", + " if shell == 'ZMQInteractiveShell':\n", + " return True # Jupyter notebook or qtconsole\n", + " elif shell == 'TerminalInteractiveShell':\n", + " return False # Terminal running IPython\n", + " else:\n", + " return False # Other type (?)\n", + " except NameError:\n", + " return False # Probably standard Python interpreter\n", + "\n", + "if is_notebook():\n", + " # this makes the notebook wider on a larger screen using %x of the display\n", + " display(HTML(\"\"))\n", + " # save this notebook as a raw python file as well please\n", + " get_ipython().system('jupyter nbconvert --to script generate-oomph.ipynb')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# ------------------------------------------------------------------\n", + "# Command line params\n", + "# ------------------------------------------------------------------\n", + "def get_command_line_args(notebook_args=None):\n", + " parser = argparse.ArgumentParser(description='Generator for oomph benchmarks')\n", + " parser.add_argument('-d', '--dir', default=cwd, action='store', help='base directory to generate job scripts in')\n", + " parser.add_argument('-t', '--type', default='normal', action='store', help='normal, timed or native for different test types')\n", + " parser.add_argument('-T', '--timeout', default=120, action='store', help='executable timeout period')\n", + " parser.add_argument('-m', '--machine', default='', action='store', help='select machine batch job config/preamble')\n", + " if is_notebook():\n", + " parser.add_argument('-f', help='seems to be defaulted by jupyter')\n", + " return parser.parse_args(notebook_args)\n", + " return parser.parse_args()\n", + "\n", + "notebook_args = '--type=native --dir /home/biddisco/benchmarking-results/test'.split()\n", + "if is_notebook():\n", + " args = get_command_line_args(notebook_args)\n", + "else:\n", + " args = get_command_line_args()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CWD : /home/biddisco/src/ghex/extern/oomph/benchmarks/scripts \n", + "Scriptpath : /tmp/ipykernel_91426 \n", + "Hostname : oryx2\n" + ] + } + ], + "source": [ + "# hostname + cleanup login node 'daint101' etc\n", + "if args.machine != '':\n", + " hostname = args.machine\n", + "elif os.environ.get('LUMI_STACK_NAME', 'oryx2') == 'LUMI':\n", + " hostname = 'lumi'\n", + "elif socket.gethostname().startswith('daint'):\n", + " hostname = 'daint'\n", + "else :\n", + " hostname = 'oryx2'\n", + "\n", + "# summary\n", + "print(f'CWD : {cwd} \\nScriptpath : {scriptpath} \\nHostname : {hostname}')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def make_executable(path):\n", + " mode = os.stat(path).st_mode\n", + " mode |= (mode & 0o444) >> 2 # copy R bits to X\n", + " os.chmod(path, mode)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating scripts in /home/biddisco/benchmarking-results/test\n" + ] + } + ], + "source": [ + "# strings with @xxx@ will be substituted by cmake\n", + "binary_dir = \"@BIN_DIR@\"\n", + "\n", + "if args.dir:\n", + " run_dir = args.dir\n", + "else:\n", + " run_dir = \"@RUN_DIR@\"\n", + "\n", + "print(f'Generating scripts in {run_dir}')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "cscs = {}\n", + "\n", + "# jb laptop\n", + "cscs[\"oryx2\"] = {\n", + " \"Machine\":\"system76\",\n", + " \"Cores\": 8,\n", + " \"Threads per core\": 2,\n", + " \"Allowed rpns\": [1, 2],\n", + " \"Thread_array\": [1,2,4],\n", + " \"Sleeptime\":0,\n", + " \"Launch\": \"pushd {job_path} && source {job_file} && popd\",\n", + " \"Run command\": \"mpiexec -n {total_ranks} --oversubscribe timeout {timeout} \",\n", + " \"Batch preamble\": \"\"\"\n", + "#!/bin/bash -l\n", + "\n", + "# Env\n", + "#export OMP_NUM_THREADS={threads}\n", + "#export GOMP_CPU_AFFINITY=0-{threadsm1}\n", + "\n", + "# Commands\n", + "\"\"\"\n", + "}\n", + "\n", + "# daint mc nodes config\n", + "cscs[\"daint\"] = {\n", + " \"Machine\":\"daint\",\n", + " \"Cores\": 128,\n", + " \"Threads per core\": 2,\n", + " \"Allowed rpns\": [1],\n", + " \"Thread_array\": [1,2,4,8,16],\n", + " \"Sleeptime\":0.25,\n", + " \"Launch\": \"sbatch --chdir={job_path} {job_file}\",\n", + " \"Run command\": \"srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} \",\n", + " \"Batch preamble\": \"\"\"\n", + "#!/bin/bash -l\n", + "#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}\n", + "#SBATCH --time={time_min}\n", + "#SBATCH --nodes={nodes}\n", + "#SBATCH --partition=normal\n", + "#SBATCH --account=csstaff\n", + "#SBATCH --constraint=mc\n", + "#SBATCH --output=output.txt\n", + "#SBATCH --error=error.txt\n", + "\n", + "module swap craype/2.7.10 craype/2.7.15\n", + "\n", + "# alternatives : srun --cpu-bind v,mask_cpu:0xffff\n", + "# export GOMP_CPU_AFFINITY=0-{threadsm1}\n", + "\n", + "# Old Env vars that might be useful\n", + "# export MPICH_MAX_THREAD_SAFETY=multiple\n", + "# export OMP_NUM_THREADS={threads}\n", + "# export MKL_NUM_THREADS={threads}\n", + "# export MPICH_GNI_NDREG_ENTRIES=1024\n", + "\n", + "# Debug\n", + "module list &> modules.txt\n", + "printenv > env.txt\n", + "\n", + "# Commands\n", + "\"\"\"\n", + "}\n", + "\n", + "# daint mc nodes config\n", + "cscs[\"lumi\"] = {\n", + " \"Machine\":\"lumi\",\n", + " \"Cores\": 16,\n", + " \"Threads per core\": 2,\n", + " \"Allowed rpns\": [1],\n", + " \"Thread_array\": [1,2,4,8,16],\n", + " \"Sleeptime\":0.25,\n", + " \"Launch\": \"sbatch --chdir={job_path} {job_file}\",\n", + " \"Run command\": \"srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} \",\n", + " \"Batch preamble\": \"\"\"\n", + "#!/bin/bash -l\n", + "#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size}\n", + "#SBATCH --time={time_min}\n", + "#SBATCH --nodes={nodes}\n", + "#SBATCH --partition=standard\n", + "#SBATCH --account=project_465000105\n", + "#SBATCH --output=output.txt\n", + "#SBATCH --error=error.txt\n", + "\n", + "module load LUMI/22.06\n", + "module load cpeGNU\n", + "module load buildtools\n", + "module load Boost\n", + "\n", + "# alternatives : srun --cpu-bind v,mask_cpu:0xffff\n", + "# export GOMP_CPU_AFFINITY=0-{threadsm1}\n", + "\n", + "export MPICH_MAX_THREAD_SAFETY=multiple\n", + "# export OMP_NUM_THREADS={threads}\n", + "# export MKL_NUM_THREADS={threads}\n", + "# export MPICH_GNI_NDREG_ENTRIES=1024\n", + "\n", + "# Debug\n", + "module list &> modules.txt\n", + "printenv > env.txt\n", + "\n", + "# Commands\n", + "\"\"\"\n", + "}\n", + "\n", + "\n", + "cscs['eiger'] = cscs['daint']\n", + "cscs['eiger']['Machine'] = 'eiger'\n", + "cscs['eiger']['Cores'] = 64\n", + "cscs['eiger']['Thread_array'] = [1,2,4,8,16]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Generate Job script preamble\n", + "#\n", + "def init_job_text(system, run_name, time_min, transport, nodes, threads, inflight, size):\n", + " return system[\"Batch preamble\"].format(run_name=run_name,\n", + " time_min=time_min,\n", + " transport=transport,\n", + " nodes=nodes,\n", + " threads=threads,\n", + " threadsm1=(threads-1),\n", + " inflight=inflight,\n", + " size=size).strip()\n", + "#\n", + "# create a directory name from params\n", + "#\n", + "def make_job_directory(fdir,name, transport, nodes, threads, inflight, size):\n", + " return f'{fdir}/{name}_{transport}_{nodes}_{threads}_{inflight}_{size}'\n", + "\n", + "#\n", + "# create the launch command-line\n", + "#\n", + "def run_command(system, total_ranks, cpus_per_rank, timeout):\n", + " return system[\"Run command\"].format(total_ranks=total_ranks, cpus_per_rank=cpus_per_rank, threads_per_rank=cpus_per_rank, timeout=timeout)\n", + "\n", + "#\n", + "# create dir + write final script for sbatch/shell or other job launcher\n", + "#\n", + "def write_job_file(system, launch_file, job_dir, job_text, suffix=''):\n", + " job_path = os.path.expanduser(job_dir)\n", + " os.makedirs(job_path, exist_ok=True)\n", + " job_file = f\"{job_path}/job_{suffix}.sh\"\n", + " print(f\"Generating : {job_path} : {job_file}\")\n", + "\n", + " with open(job_file, \"w\") as f:\n", + " f.write(job_text)\n", + " make_executable(job_file)\n", + "\n", + " launchstring = system[\"Launch\"].format(job_path=job_path,job_file=job_file) + '\\n'\n", + " launchstring += 'sleep ' + str(system['Sleeptime']) + '\\n'\n", + " launch_file.write(launchstring)\n", + "\n", + "#\n", + "# generate a string that decorates and launches a single instance of the test\n", + "#\n", + "def execution_string(env, launch_cmd, prog_cmd, output_redirect):\n", + " full_command = f\"{env} {launch_cmd} {prog_cmd}\".strip()\n", + " command_prologue = f'printf \"\\\\n'\n", + " command_prologue += f'# ----- Executing \\\\n'\n", + " command_prologue += f'{full_command} \\\\n'\n", + " command_prologue += f'# --------------- \\\\n\" >> {output_redirect}'\n", + " command_epilogue = f'printf \"\\\\n'\n", + " command_epilogue += f'# ----- Finished \\\\n\\\\n\" >> {output_redirect}'\n", + " return '\\n' + command_prologue + '\\n' + full_command + ' >> ' + output_redirect + '\\n' + command_epilogue + '\\n'\n", + "\n", + "#\n", + "# generate application specific commmands/flags/options that go into the job script\n", + "#\n", + "def oomph_original(system, bin_dir, timeout, transport, progs, nodes, threads, msg, size, inflight, env):\n", + " total_ranks = 2\n", + " whole_cmd = ''\n", + " suffix = ''\n", + "\n", + " # transport layers use '_libfabric', '_ucx', '_mpi', etc\n", + " if args.type!='native':\n", + " suffix = f'_{transport}'\n", + "\n", + " # timed version uses seconds instead of messages/iterations\n", + " if args.type=='timed':\n", + " msg = 30\n", + "\n", + " # always remember to add a space to the end of each env var for concatenation of many of them\n", + " if threads==1:\n", + " env += 'MPICH_MAX_THREAD_SAFETY=single '\n", + " else:\n", + " env += 'MPICH_MAX_THREAD_SAFETY=multiple '\n", + " env += f'OMP_NUM_THREADS={threads} '\n", + " env += f'GOMP_CPU_AFFINITY=0-{threads} '\n", + "\n", + " for prog in progs:\n", + " if threads>1:\n", + " if args.type=='normal' or args.type=='timed':\n", + " prog = prog + '_mt'\n", + "\n", + " if transport=='native' and threads==1:\n", + " prog = prog.replace('_mt_','_')\n", + "\n", + " # generate the name of the output file we redirect output to\n", + " outfile = f'{prog}_N{nodes}_T{threads}_I{msg}_S{size}_F{inflight}.out'\n", + "\n", + " # generate the program commmand with all command line params needed by program\n", + " prog_cmd = f\"{bin_dir}/{prog}{suffix} {msg} {size} {inflight}\"\n", + "\n", + " # get the system launch command (mpiexec, srun, etc) with options/params\n", + " launch_cmd = run_command(system, total_ranks, threads, timeout)\n", + "\n", + " if transport=='libfabric':\n", + " env2 = env + 'LIBFABRIC_POLL_SIZE=32 '\n", + " #for ep in ['single', 'multiple', 'scalableTx', 'threadlocal']:\n", + " for ep in ['threadlocal']:\n", + " whole_cmd += execution_string(env2 + f\"LIBFABRIC_ENDPOINT_TYPE={ep} \", launch_cmd, prog_cmd, outfile)\n", + " if False: # add option to enable this?\n", + " whole_cmd += execution_string(env2 + f\"LIBFABRIC_ENDPOINT_TYPE={ep} \" + f\"LIBFABRIC_AUTO_PROGRESS=1 \", launch_cmd, prog_cmd, outfile)\n", + " else:\n", + " whole_cmd += execution_string(env, launch_cmd, prog_cmd, outfile)\n", + "\n", + " return whole_cmd" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "system = cscs[hostname]\n", + "#\n", + "job_name = 'oomph'\n", + "timeout = args.timeout\n", + "time_min = 2000*60 # total time estimate\n", + "timestr = time.strftime('%H:%M:%S', time.gmtime(time_min))\n", + "ranks_per_node = 1\n", + "nodes_arr = [2]\n", + "thrd_arr = system['Thread_array']\n", + "size_arr = [1,10,100,1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000]\n", + "nmsg_lut = {1:500000,\n", + " 10:500000,\n", + " 100:500000,\n", + " 1000:500000,\n", + " 2000:500000,\n", + " 5000:250000,\n", + " 10000:250000,\n", + " 20000:250000,\n", + " 50000:250000,\n", + " 100000:250000,\n", + " 200000:250000,\n", + " 500000:100000,\n", + " 1000000:50000,\n", + " 2000000:25000}\n", + "\n", + "flight_arr = [1,10,50,100]\n", + "\n", + "if args.type=='normal':\n", + " trans_arr = ['libfabric', 'mpi']\n", + " prog_arr = [\n", + " #\"bench_p2p_bi_cb_avail\",\n", + " #\"bench_p2p_bi_cb_wait\",\n", + " \"bench_p2p_bi_ft_avail\",\n", + " #\"bench_p2p_bi_ft_wait\"\n", + " ]\n", + "\n", + "if args.type=='timed':\n", + " trans_arr = ['libfabric', 'mpi']\n", + " prog_arr = ['bench_p2p_pp_ft_avail']\n", + "\n", + "if args.type=='native':\n", + " trans_arr = ['native']\n", + " prog_arr = [\n", + " #\"mpi_p2p_bi_avail_mt_test\", \"mpi_p2p_bi_avail_mt_testany\",\n", + " #\"mpi_p2p_bi_wait_mt_wait\",\n", + " \"mpi_p2p_bi_wait_mt_waitall\"\n", + " ]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Uncommment the following line to perform the job creation\n", + "Generating : /home/biddisco/benchmarking-results/test/oomph_all_2_1_1_1 : /home/biddisco/benchmarking-results/test/oomph_all_2_1_1_1/job_.sh\n", + "Combinations 168 est-time 1344 minutes\n" + ] + } + ], + "source": [ + "combos = 0\n", + "\n", + "if run_dir.startswith('@'):\n", + " print(f'Skipping creation of job launch file for {run_dir}')\n", + "else:\n", + " job_launch = f\"{run_dir}/launch.sh\"\n", + " job_launch_file = open(job_launch, \"w\")\n", + " #\n", + " job_launch_file.write(\"#!/bin/bash -l\\n\")\n", + "\n", + "# create the output directory for each job\n", + "job_dir = make_job_directory(run_dir, 'oomph', \"all\", 2, 1, 1, 1)\n", + "\n", + "# first part of boiler plate job script\n", + "job_text = init_job_text(system, job_name, timestr, \"all\", 2, 16, 1, 1)\n", + "\n", + "# generate all combinations in one monster loop\n", + "for nodes, transport, threads, size, inflight in product(nodes_arr, trans_arr, thrd_arr, size_arr, flight_arr):\n", + "\n", + " env = \"\"\n", + " msg = nmsg_lut[size]\n", + "\n", + " # create the output directory for each job\n", + " #job_dir = make_job_directory(run_dir, 'oomph', transport, nodes, threads, inflight, size)\n", + "\n", + " # first part of boiler plate job script\n", + " #job_text = init_job_text(system, job_name, timestr, transport, nodes, threads, inflight, size)\n", + "\n", + " env = 'MPICH_GNI_NDREG_ENTRIES=1024 '\n", + "\n", + " # application specific part of job script\n", + " job_text += oomph_original(\n", + " system,\n", + " binary_dir,\n", + " timeout,\n", + " transport,\n", + " prog_arr,\n", + " nodes,\n", + " threads,\n", + " msg,\n", + " size,\n", + " inflight,\n", + " env\n", + " )\n", + " # debugging\n", + " # print(job_dir, '\\n', job_text, '\\n\\n\\n\\n')\n", + "\n", + " combos += 1\n", + "\n", + " if combos==1:\n", + " print('Uncommment the following line to perform the job creation')\n", + "\n", + "write_job_file(system, job_launch_file, job_dir, job_text)\n", + "\n", + "make_executable(job_launch)\n", + "print('Combinations', combos, 'est-time', combos*4*2,'minutes')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/benchmarks/scripts/generate-oomph.py b/benchmarks/scripts/generate-oomph.py new file mode 100644 index 00000000..1c43715d --- /dev/null +++ b/benchmarks/scripts/generate-oomph.py @@ -0,0 +1,454 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +from itertools import product +import math +import numpy as np +import inspect +import os +import time +from IPython.display import Image, display, HTML +import importlib +import socket +import argparse + +# working dir +cwd = os.getcwd() + +# name of this script +scriptname = inspect.getframeinfo(inspect.currentframe()).filename +scriptpath = os.path.dirname(os.path.abspath(scriptname)) + + +# In[2]: + + +def is_notebook(): + try: + shell = get_ipython().__class__.__name__ + if shell == 'ZMQInteractiveShell': + return True # Jupyter notebook or qtconsole + elif shell == 'TerminalInteractiveShell': + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + +if is_notebook(): + # this makes the notebook wider on a larger screen using %x of the display + display(HTML("")) + # save this notebook as a raw python file as well please + get_ipython().system('jupyter nbconvert --to script generate-oomph.ipynb') + + +# In[3]: + + +# ------------------------------------------------------------------ +# Command line params +# ------------------------------------------------------------------ +def get_command_line_args(notebook_args=None): + parser = argparse.ArgumentParser(description='Generator for oomph benchmarks') + parser.add_argument('-d', '--dir', default=cwd, action='store', help='base directory to generate job scripts in') + parser.add_argument('-t', '--type', default='normal', action='store', help='normal, timed or native for different test types') + parser.add_argument('-T', '--timeout', default=120, action='store', help='executable timeout period') + parser.add_argument('-m', '--machine', default='', action='store', help='select machine batch job config/preamble') + if is_notebook(): + parser.add_argument('-f', help='seems to be defaulted by jupyter') + return parser.parse_args(notebook_args) + return parser.parse_args() + +notebook_args = '--type=native --dir /home/biddisco/benchmarking-results/test'.split() +if is_notebook(): + args = get_command_line_args(notebook_args) +else: + args = get_command_line_args() + + +# In[4]: + + +# hostname + cleanup login node 'daint101' etc +if args.machine != '': + hostname = args.machine +elif os.environ.get('LUMI_STACK_NAME', 'oryx2') == 'LUMI': + hostname = 'lumi' +elif socket.gethostname().startswith('daint'): + hostname = 'daint' +else : + hostname = 'oryx2' + +# summary +print(f'CWD : {cwd} \nScriptpath : {scriptpath} \nHostname : {hostname}') + + +# In[5]: + + +def make_executable(path): + mode = os.stat(path).st_mode + mode |= (mode & 0o444) >> 2 # copy R bits to X + os.chmod(path, mode) + + +# In[6]: + + +# strings with @xxx@ will be substituted by cmake +binary_dir = "@BIN_DIR@" + +if args.dir: + run_dir = args.dir +else: + run_dir = "@RUN_DIR@" + +print(f'Generating scripts in {run_dir}') + + +# In[7]: + + +cscs = {} + +# jb laptop +cscs["oryx2"] = { + "Machine":"system76", + "Cores": 8, + "Threads per core": 2, + "Allowed rpns": [1, 2], + "Thread_array": [1,2,4], + "Sleeptime":0, + "Launch": "pushd {job_path} && source {job_file} && popd", + "Run command": "mpiexec -n {total_ranks} --oversubscribe timeout {timeout} ", + "Batch preamble": """ +#!/bin/bash -l + +# Env +#export OMP_NUM_THREADS={threads} +#export GOMP_CPU_AFFINITY=0-{threadsm1} + +# Commands +""" +} + +# daint mc nodes config +cscs["daint"] = { + "Machine":"daint", + "Cores": 128, + "Threads per core": 2, + "Allowed rpns": [1], + "Thread_array": [1,2,4,8,16], + "Sleeptime":0.25, + "Launch": "sbatch --chdir={job_path} {job_file}", + "Run command": "srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} ", + "Batch preamble": """ +#!/bin/bash -l +#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size} +#SBATCH --time={time_min} +#SBATCH --nodes={nodes} +#SBATCH --partition=normal +#SBATCH --account=csstaff +#SBATCH --constraint=mc +#SBATCH --output=output.txt +#SBATCH --error=error.txt + +module swap craype/2.7.10 craype/2.7.15 + +# alternatives : srun --cpu-bind v,mask_cpu:0xffff +# export GOMP_CPU_AFFINITY=0-{threadsm1} + +# Old Env vars that might be useful +# export MPICH_MAX_THREAD_SAFETY=multiple +# export OMP_NUM_THREADS={threads} +# export MKL_NUM_THREADS={threads} +# export MPICH_GNI_NDREG_ENTRIES=1024 + +# Debug +module list &> modules.txt +printenv > env.txt + +# Commands +""" +} + +# daint mc nodes config +cscs["lumi"] = { + "Machine":"lumi", + "Cores": 16, + "Threads per core": 2, + "Allowed rpns": [1], + "Thread_array": [1,2,4,8,16], + "Sleeptime":0.25, + "Launch": "sbatch --chdir={job_path} {job_file}", + "Run command": "srun --cpu-bind=cores --unbuffered --ntasks {total_ranks} --cpus-per-task {threads_per_rank} timeout {timeout} ", + "Batch preamble": """ +#!/bin/bash -l +#SBATCH --job-name={run_name}_{transport}_{nodes}_{threads}_{inflight}_{size} +#SBATCH --time={time_min} +#SBATCH --nodes={nodes} +#SBATCH --partition=standard +#SBATCH --account=project_465000105 +#SBATCH --output=output.txt +#SBATCH --error=error.txt + +module load LUMI/22.06 +module load cpeGNU +module load buildtools +module load Boost + +# alternatives : srun --cpu-bind v,mask_cpu:0xffff +# export GOMP_CPU_AFFINITY=0-{threadsm1} + +export MPICH_MAX_THREAD_SAFETY=multiple +# export OMP_NUM_THREADS={threads} +# export MKL_NUM_THREADS={threads} +# export MPICH_GNI_NDREG_ENTRIES=1024 + +# Debug +module list &> modules.txt +printenv > env.txt + +# Commands +""" +} + + +cscs['eiger'] = cscs['daint'] +cscs['eiger']['Machine'] = 'eiger' +cscs['eiger']['Cores'] = 64 +cscs['eiger']['Thread_array'] = [1,2,4,8,16] + + +# In[8]: + + +# +# Generate Job script preamble +# +def init_job_text(system, run_name, time_min, transport, nodes, threads, inflight, size): + return system["Batch preamble"].format(run_name=run_name, + time_min=time_min, + transport=transport, + nodes=nodes, + threads=threads, + threadsm1=(threads-1), + inflight=inflight, + size=size).strip() +# +# create a directory name from params +# +def make_job_directory(fdir,name, transport, nodes, threads, inflight, size): + return f'{fdir}/{name}_{transport}_{nodes}_{threads}_{inflight}_{size}' + +# +# create the launch command-line +# +def run_command(system, total_ranks, cpus_per_rank, timeout): + return system["Run command"].format(total_ranks=total_ranks, cpus_per_rank=cpus_per_rank, threads_per_rank=cpus_per_rank, timeout=timeout) + +# +# create dir + write final script for sbatch/shell or other job launcher +# +def write_job_file(system, launch_file, job_dir, job_text, suffix=''): + job_path = os.path.expanduser(job_dir) + os.makedirs(job_path, exist_ok=True) + job_file = f"{job_path}/job_{suffix}.sh" + print(f"Generating : {job_path} : {job_file}") + + with open(job_file, "w") as f: + f.write(job_text) + make_executable(job_file) + + launchstring = system["Launch"].format(job_path=job_path,job_file=job_file) + '\n' + launchstring += 'sleep ' + str(system['Sleeptime']) + '\n' + launch_file.write(launchstring) + +# +# generate a string that decorates and launches a single instance of the test +# +def execution_string(env, launch_cmd, prog_cmd, output_redirect): + full_command = f"{env} {launch_cmd} {prog_cmd}".strip() + command_prologue = f'printf "\\n' + command_prologue += f'# ----- Executing \\n' + command_prologue += f'{full_command} \\n' + command_prologue += f'# --------------- \\n" >> {output_redirect}' + command_epilogue = f'printf "\\n' + command_epilogue += f'# ----- Finished \\n\\n" >> {output_redirect}' + return '\n' + command_prologue + '\n' + full_command + ' >> ' + output_redirect + '\n' + command_epilogue + '\n' + +# +# generate application specific commmands/flags/options that go into the job script +# +def oomph_original(system, bin_dir, timeout, transport, progs, nodes, threads, msg, size, inflight, env): + total_ranks = 2 + whole_cmd = '' + suffix = '' + + # transport layers use '_libfabric', '_ucx', '_mpi', etc + if args.type!='native': + suffix = f'_{transport}' + + # timed version uses seconds instead of messages/iterations + if args.type=='timed': + msg = 30 + + # always remember to add a space to the end of each env var for concatenation of many of them + if threads==1: + env += 'MPICH_MAX_THREAD_SAFETY=single ' + else: + env += 'MPICH_MAX_THREAD_SAFETY=multiple ' + env += f'OMP_NUM_THREADS={threads} ' + env += f'GOMP_CPU_AFFINITY=0-{threads} ' + + for prog in progs: + if threads>1: + if args.type=='normal' or args.type=='timed': + prog = prog + '_mt' + + if transport=='native' and threads==1: + prog = prog.replace('_mt_','_') + + # generate the name of the output file we redirect output to + outfile = f'{prog}_N{nodes}_T{threads}_I{msg}_S{size}_F{inflight}.out' + + # generate the program commmand with all command line params needed by program + prog_cmd = f"{bin_dir}/{prog}{suffix} {msg} {size} {inflight}" + + # get the system launch command (mpiexec, srun, etc) with options/params + launch_cmd = run_command(system, total_ranks, threads, timeout) + + if transport=='libfabric': + env2 = env + 'LIBFABRIC_POLL_SIZE=32 ' + #for ep in ['single', 'multiple', 'scalableTx', 'threadlocal']: + for ep in ['threadlocal']: + whole_cmd += execution_string(env2 + f"LIBFABRIC_ENDPOINT_TYPE={ep} ", launch_cmd, prog_cmd, outfile) + if False: # add option to enable this? + whole_cmd += execution_string(env2 + f"LIBFABRIC_ENDPOINT_TYPE={ep} " + f"LIBFABRIC_AUTO_PROGRESS=1 ", launch_cmd, prog_cmd, outfile) + else: + whole_cmd += execution_string(env, launch_cmd, prog_cmd, outfile) + + return whole_cmd + + +# In[9]: + + +system = cscs[hostname] +# +job_name = 'oomph' +timeout = args.timeout +time_min = 2000*60 # total time estimate +timestr = time.strftime('%H:%M:%S', time.gmtime(time_min)) +ranks_per_node = 1 +nodes_arr = [2] +thrd_arr = system['Thread_array'] +size_arr = [1,10,100,1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000] +nmsg_lut = {1:500000, + 10:500000, + 100:500000, + 1000:500000, + 2000:500000, + 5000:250000, + 10000:250000, + 20000:250000, + 50000:250000, + 100000:250000, + 200000:250000, + 500000:100000, + 1000000:50000, + 2000000:25000} + +flight_arr = [1,10,50,100] + +if args.type=='normal': + trans_arr = ['libfabric', 'mpi'] + prog_arr = [ + #"bench_p2p_bi_cb_avail", + #"bench_p2p_bi_cb_wait", + "bench_p2p_bi_ft_avail", + #"bench_p2p_bi_ft_wait" + ] + +if args.type=='timed': + trans_arr = ['libfabric', 'mpi'] + prog_arr = ['bench_p2p_pp_ft_avail'] + +if args.type=='native': + trans_arr = ['native'] + prog_arr = [ + #"mpi_p2p_bi_avail_mt_test", "mpi_p2p_bi_avail_mt_testany", + #"mpi_p2p_bi_wait_mt_wait", + "mpi_p2p_bi_wait_mt_waitall" + ] + + +# In[10]: + + +combos = 0 + +if run_dir.startswith('@'): + print(f'Skipping creation of job launch file for {run_dir}') +else: + job_launch = f"{run_dir}/launch.sh" + job_launch_file = open(job_launch, "w") + # + job_launch_file.write("#!/bin/bash -l\n") + +# create the output directory for each job +job_dir = make_job_directory(run_dir, 'oomph', "all", 2, 1, 1, 1) + +# first part of boiler plate job script +job_text = init_job_text(system, job_name, timestr, "all", 2, 16, 1, 1) + +# generate all combinations in one monster loop +for nodes, transport, threads, size, inflight in product(nodes_arr, trans_arr, thrd_arr, size_arr, flight_arr): + + env = "" + msg = nmsg_lut[size] + + # create the output directory for each job + #job_dir = make_job_directory(run_dir, 'oomph', transport, nodes, threads, inflight, size) + + # first part of boiler plate job script + #job_text = init_job_text(system, job_name, timestr, transport, nodes, threads, inflight, size) + + env = 'MPICH_GNI_NDREG_ENTRIES=1024 ' + + # application specific part of job script + job_text += oomph_original( + system, + binary_dir, + timeout, + transport, + prog_arr, + nodes, + threads, + msg, + size, + inflight, + env + ) + # debugging + # print(job_dir, '\n', job_text, '\n\n\n\n') + + combos += 1 + + if combos==1: + print('Uncommment the following line to perform the job creation') + +write_job_file(system, job_launch_file, job_dir, job_text) + +make_executable(job_launch) +print('Combinations', combos, 'est-time', combos*4*2,'minutes') + + +# In[ ]: + + + + diff --git a/cmake/oomph_external_dependencies.cmake b/cmake/oomph_external_dependencies.cmake index 92de39bd..e0eb1a65 100644 --- a/cmake/oomph_external_dependencies.cmake +++ b/cmake/oomph_external_dependencies.cmake @@ -1,8 +1,9 @@ include(oomph_git_submodule) include(oomph_external_project) +include(ExternalProject) if(OOMPH_GIT_SUBMODULE) - update_git_submodules() +# update_git_submodules() endif() # --------------------------------------------------------------------- @@ -15,48 +16,40 @@ find_package(MPI REQUIRED COMPONENTS CXX) # --------------------------------------------------------------------- find_package(Boost REQUIRED) -# --------------------------------------------------------------------- -# hwmalloc setup -# --------------------------------------------------------------------- -cmake_dependent_option(OOMPH_USE_BUNDLED_HWMALLOC "Use bundled hwmalloc lib." ON - "OOMPH_USE_BUNDLED_LIBS" OFF) -if(OOMPH_USE_BUNDLED_HWMALLOC) - check_git_submodule(hwmalloc ext/hwmalloc) - add_subdirectory(ext/hwmalloc) - add_library(HWMALLOC::hwmalloc ALIAS hwmalloc) -else() - find_package(HWMALLOC REQUIRED) -endif() +#------------------------------------------------------------------------------ +# Find Threads +#------------------------------------------------------------------------------ +find_package(Threads REQUIRED) + +# ------------------------------------------------------------------------------ +# Build/Download HWMalloc +# ------------------------------------------------------------------------------ +get_external_project( + PROJECT_NAME + "hwmalloc" + FOLDER_NAME + "hwmalloc" + GIT_REPO + "https://github.com/ghex-org/hwmalloc.git" + GIT_TAG + "master" +) + +add_library(HWMALLOC::hwmalloc ALIAS hwmalloc) # --------------------------------------------------------------------- # google test setup # --------------------------------------------------------------------- -cmake_dependent_option(OOMPH_USE_BUNDLED_GTEST "Use bundled googletest lib." ON - "OOMPH_USE_BUNDLED_LIBS" OFF) -if (OOMPH_WITH_TESTING) - if(OOMPH_USE_BUNDLED_GTEST) - add_external_cmake_project( - NAME googletest - PATH ext/googletest - INTERFACE_NAME ext-gtest - LIBS libgtest.a libgtest_main.a - CMAKE_ARGS - "-DCMAKE_BUILD_TYPE=release" - "-DBUILD_SHARED_LIBS=OFF" - "-DBUILD_GMOCK=OFF") - # on some systems we need link explicitly against threads - if (TARGET ext-gtest) - find_package (Threads) - target_link_libraries(ext-gtest INTERFACE Threads::Threads) - endif() - else() - # Use system provided google test - find_package(GTest REQUIRED) - add_library(ext-gtest INTERFACE) - if (${CMAKE_VERSION} VERSION_LESS "3.20.0") - target_link_libraries(ext-gtest INTERFACE GTest::GTest GTest::Main) - else() - target_link_libraries(ext-gtest INTERFACE GTest::gtest GTest::gtest_main) - endif() - endif() +find_package(GTest QUIET) +message("GTest FOUND ${GTest_FOUND}") +if (NOT GTest_FOUND) + include(FetchContent) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG main + GIT_SHALLOW TRUE) + # For Windows: Prevent overriding the parent project's compiler/linker settings + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) endif() diff --git a/cmake/oomph_libfabric.cmake b/cmake/oomph_libfabric.cmake index 5a981bc1..c5209e99 100644 --- a/cmake/oomph_libfabric.cmake +++ b/cmake/oomph_libfabric.cmake @@ -76,7 +76,7 @@ if (OOMPH_WITH_LIBFABRIC) file(WRITE ${TEMP_FILENAME} ${PREAMBLE} ${oomph_config_defines} - "\n#endif\n" + "#endif\n" ) configure_file("${TEMP_FILENAME}" "${OPTION_FILENAME}" COPYONLY) file(REMOVE "${TEMP_FILENAME}") @@ -93,8 +93,9 @@ if (OOMPH_WITH_LIBFABRIC) # Hardware device selection #------------------------------------------------------------------------------ set(OOMPH_LIBFABRIC_PROVIDER "tcp" CACHE - STRING "The provider (cxi/gni/psm2/sockets/tcp/verbs)") - set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS "cxi" "gni" "psm2" "sockets" "tcp" "verbs") + STRING "The provider (cxi(Cray Slingshot)/efa(Amazon Elastic)/gni(Cray Gemini)/psm2(Intel Omni-Path)/tcp/verbs(Infiniband))") + set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS + "cxi" "efa" "gni" "psm2" "tcp" "verbs") oomph_libfabric_add_config_define_namespace( DEFINE HAVE_LIBFABRIC_PROVIDER @@ -115,11 +116,17 @@ if (OOMPH_WITH_LIBFABRIC) oomph_libfabric_add_config_define_namespace( DEFINE HAVE_LIBFABRIC_CXI NAMESPACE libfabric) + elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "efa") + oomph_libfabric_add_config_define_namespace( + DEFINE HAVE_LIBFABRIC_EFA + NAMESPACE libfabric) elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "tcp") oomph_libfabric_add_config_define_namespace( DEFINE HAVE_LIBFABRIC_TCP NAMESPACE libfabric) elseif(OOMPH_LIBFABRIC_PROVIDER MATCHES "sockets") + message(WARNING "The Sockets provider is deprecated in favor of the tcp, udp, " + "and utility providers") oomph_libfabric_add_config_define_namespace( DEFINE HAVE_LIBFABRIC_SOCKETS NAMESPACE libfabric) @@ -134,7 +141,6 @@ if (OOMPH_WITH_LIBFABRIC) #------------------------------------------------------------------------------ set(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS OFF BOOL STRING "Enable libfabric parcelport performance counters (default: OFF)") - set_property(CACHE OOMPH_LIBFABRIC_PROVIDER PROPERTY STRINGS "tcp" "sockets" "psm2" "verbs" "gni") mark_as_advanced(OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) if (OOMPH_LIBFABRIC_WITH_PERFORMANCE_COUNTERS) @@ -143,14 +149,20 @@ if (OOMPH_WITH_LIBFABRIC) NAMESPACE libfabric) endif() + #------------------------------------------------------------------------------ + # used by template expansion for location of print.hpp + #------------------------------------------------------------------------------ + set(OOMPH_SRC_LIBFABRIC_DIR "${PROJECT_SOURCE_DIR}/src/libfabric") + #------------------------------------------------------------------------------ # Write options to file in build dir #------------------------------------------------------------------------------ oomph_libfabric_write_config_defines_file( NAMESPACE libfabric - FILENAME "${PROJECT_BINARY_DIR}/oomph_libfabric_defines.hpp" + FILENAME "${PROJECT_BINARY_DIR}/src/libfabric/oomph_libfabric_defines.hpp" + TEMPLATE "${OOMPH_SRC_LIBFABRIC_DIR}/libfabric_defines_template.hpp" ) - target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}") + target_include_directories(oomph_libfabric PRIVATE "${PROJECT_BINARY_DIR}/src/libfabric") endif() diff --git a/ext/googletest b/ext/googletest deleted file mode 160000 index b10fad38..00000000 --- a/ext/googletest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b10fad38c4026a29ea6561ab15fc4818170d1c10 diff --git a/include/oomph/context.hpp b/include/oomph/context.hpp index f1519ef6..bd12588b 100644 --- a/include/oomph/context.hpp +++ b/include/oomph/context.hpp @@ -120,7 +120,7 @@ template typename Context::region_type register_memory(Context&, void*, std::size_t); #if OOMPH_ENABLE_DEVICE template -typename Context::device_region_type register_device_memory(Context&, void*, std::size_t); +typename Context::device_region_type register_device_memory(Context&, int, void*, std::size_t); #endif } // namespace oomph diff --git a/spack/packages/oomph/package.py b/spack/packages/oomph/package.py new file mode 100644 index 00000000..a0281b7a --- /dev/null +++ b/spack/packages/oomph/package.py @@ -0,0 +1,96 @@ +from spack import * +import os + +class Oomph(CMakePackage, CudaPackage, ROCmPackage): + """dummy placeholder for oomph dependencies""" + homepage = "https://127.0.0.1/readme.html" + generator = "Ninja" + maintainers = ["biddisco"] + version("develop") + + # we have only tested/supported a subset of potential libfabrics providers + fabrics = ( + "cxi", "efa", "gni", "psm2", "tcp", "verbs", + ) + + variant( + "ofi", + default="tcp", + description="A list of enabled OFI fabrics", + values=fabrics, + multi=False, + ) + + # ------------------------------------------------------------------------ + # Exactly one of +cuda and +rocm need to be set + # ------------------------------------------------------------------------ + conflicts("+cuda +rocm") + + # ------------------------------------------------------------------------ + # variants + # ------------------------------------------------------------------------ + variant("ucx", default=False, description="Enable ucx support") + variant("ofi", default=False, description="Enable ofi libfabric support") + variant("testing", default=False, description="Enable testing") + + # ------------------------------------------------------------------------ + # build time dependencies + # ------------------------------------------------------------------------ + depends_on("ninja", type="build") + depends_on("cmake@3.22:", type="build") + + # ------------------------------------------------------------------------ + # generic c++ libs needed by several projects + # ------------------------------------------------------------------------ + depends_on("boost +atomic+chrono+container+context+coroutine+date_time+filesystem+program_options+regex+serialization+system+test+thread+mpi+graph+json cxxstd=17") + #depends_on("fmt") + + # ------------------------------------------------------------------------ + # GPU/cuda/rocm + # ------------------------------------------------------------------------ + depends_on("cuda", when="+cuda") + depends_on("rocm-core", when="+rocm") + + # ------------------------------------------------------------------------ + # allocators/memory + # ------------------------------------------------------------------------ + depends_on("hwloc +cuda", when="+cuda ^gcc@11.4:") + depends_on("hwloc ~cuda", when="gcc@:11.4") + depends_on("numactl") + + + # ------------------------------------------------------------------------ + # mpi and parallel io + # ------------------------------------------------------------------------ + depends_on("mpi") + depends_on("ucx +thread_multiple", when="+ucx") + depends_on("libfabric@1.17:", when="+ofi") + + # ------------------------------------------------------------------------ + # testing + # ------------------------------------------------------------------------ + depends_on("googletest") + + # ------------------------------------------------------------------------ + def cmake_args(self): + """Populate cmake arguments for Mercury.""" + spec = self.spec + define = self.define + define_from_variant = self.define_from_variant + parallel_tests = "+mpi" in spec and self.run_tests + + cmake_args = [ + define_from_variant("OOMPH_WITH_LIBFABRIC", "ofi"), + define_from_variant("OOMPH_WITH_UCX", "ucx"), + ] + + if "+ofi" in spec: + ofi_fabrics = spec["libfabric"].variants["fabrics"].value + ofi_fabrics = next((fabric for fabric in ofi_fabrics if fabric in self.fabrics), None) + if ofi_fabrics is None: + raise ValueError("No matching fabric found in ofi_fabrics and fabrics") + cmake_args.append(f"OOMPH_LIBFABRIC_PROVIDER={ofi_fabrics}") + print (cmake_args) + return cmake_args + + # ------------------------------------------------------------------------ diff --git a/src/libfabric/CMakeLists.txt b/src/libfabric/CMakeLists.txt index 6de175ad..81d7f907 100644 --- a/src/libfabric/CMakeLists.txt +++ b/src/libfabric/CMakeLists.txt @@ -1,10 +1,19 @@ find_package(Boost REQUIRED COMPONENTS thread) + +# dummy library of our private headers add_library(oomph_private_libfabric_headers INTERFACE) target_include_directories(oomph_private_libfabric_headers INTERFACE "$") + +# actual library (created in oomph_libfabric.cmake) source files, +# depends on dummy library target_link_libraries(oomph_libfabric PRIVATE oomph_private_libfabric_headers) target_link_libraries(oomph_libfabric PRIVATE Boost::thread) +# we need to include a binary dir for the oomph_config_defines.hpp file +target_include_directories(oomph_libfabric INTERFACE + "$") + list(TRANSFORM oomph_sources PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/../ OUTPUT_VARIABLE oomph_sources_libfabric) target_sources(oomph_libfabric PRIVATE ${oomph_sources_libfabric}) diff --git a/src/libfabric/README.txt b/src/libfabric/README.txt index 007e014d..70bc8151 100644 --- a/src/libfabric/README.txt +++ b/src/libfabric/README.txt @@ -42,3 +42,14 @@ clang-format -i ./include/hpx/parcelport_libfabric/controller_base.hpp meld ./include/hpx/parcelport_libfabric/memory_region.hpp ~/src/ghex/extern/oomph/src/libfabric/memory_region.hpp meld ./include/hpx/parcelport_libfabric/operation_context_base.hpp ~/src/ghex/extern/oomph/src/libfabric/operation_context_base.hpp meld ./include/hpx/parcelport_libfabric/controller_base.hpp ~/src/ghex/extern/oomph/src/libfabric/controller_base.hpp + +cp ~/src/ghex/extern/oomph/src/libfabric/memory_region.hpp ./include/hpx/parcelport_libfabric/memory_region.hpp +cp ~/src/ghex/extern/oomph/src/libfabric/operation_context_base.hpp ./include/hpx/parcelport_libfabric/operation_context_base.hpp +cp ~/src/ghex/extern/oomph/src/libfabric/controller_base.hpp ./include/hpx/parcelport_libfabric/controller_base.hpp +cp ~/src/ghex/extern/oomph/src/libfabric/fabric_error.hpp ./include/hpx/parcelport_libfabric/fabric_error.hpp + +clang-format -i include/hpx/parcelport_libfabric/memory_region.hpp +clang-format -i include/hpx/parcelport_libfabric/operation_context_base.hpp +clang-format -i include/hpx/parcelport_libfabric/controller_base.hpp +clang-format -i include/hpx/parcelport_libfabric/fabric_error.hpp + diff --git a/src/libfabric/communicator.hpp b/src/libfabric/communicator.hpp index 76509a2a..0b5397ba 100644 --- a/src/libfabric/communicator.hpp +++ b/src/libfabric/communicator.hpp @@ -20,7 +20,7 @@ // paths relative to backend #include <../communicator_base.hpp> #include <../device_guard.hpp> -#include <./operation_context.hpp> +#include #include #include #include @@ -28,12 +28,12 @@ namespace oomph { -using operation_context = oomph::libfabric::operation_context; +using operation_context = libfabric::operation_context; using tag_disp = NS_DEBUG::detail::hex<12, uintptr_t>; // cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print com_deb("COMMUNI"); +static NS_DEBUG::enable_print com_deb("COMMUNI"); static NS_DEBUG::enable_print com_err("COMMUNI"); class communicator_impl : public communicator_base @@ -63,7 +63,7 @@ class communicator_impl : public communicator_base , m_recv_cb_queue(128) , m_recv_cb_cancel(8) { - OOMPH_DP_ONLY(com_deb, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm()))); + LF_DEB(com_deb, debug(NS_DEBUG::str<>("MPI_comm"), NS_DEBUG::ptr(mpi_comm()))); m_tx_endpoint = m_context->get_controller()->get_tx_endpoint(); m_rx_endpoint = m_context->get_controller()->get_rx_endpoint(); } @@ -77,10 +77,10 @@ class communicator_impl : public communicator_base // -------------------------------------------------------------------- /// generate a tag with 0xRRRRRRRRtttttttt rank, tag. /// original tag can be 32bits, then we add 32bits of rank info. - inline std::uint64_t make_tag64(std::uint32_t tag, std::uint32_t rank) + inline std::uint64_t make_tag64(std::uint32_t tag, /*std::uint32_t rank, */std::uintptr_t ctxt) { - return (((std::uint64_t(rank) & 0x00000000FFFFFFFF) << 32) | - ((std::uint64_t(tag) & 0x00000000FFFFFFFF))); + return (((ctxt & 0x0000000000FFFFFF) << 24) | + ((std::uint64_t(tag) & 0x0000000000FFFFFF))); } // -------------------------------------------------------------------- @@ -94,7 +94,7 @@ class communicator_impl : public communicator_base if (ret == 0) { return; } else if (ret == -FI_EAGAIN) { - com_deb.error("Reposting", msg); + // com_deb.error("Reposting", msg); // no point stressing the system m_context->get_controller()->poll_for_work_completions(this); } @@ -105,7 +105,7 @@ class communicator_impl : public communicator_base com_err.error("No destination endpoint, terminating."); std::terminate(); } - else if (ret) { throw libfabric::fabric_error(int(ret), msg); } + else if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), msg); } } } @@ -116,8 +116,8 @@ class communicator_impl : public communicator_base { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); // clang-format off - OOMPH_DP_ONLY(com_deb, - debug(NS_DEBUG::str<>("send message buffer"), + LF_DEB(com_deb, + debug(NS_DEBUG::str<>("send_tagged_region"), "->", NS_DEBUG::dec<2>(dst_addr_), send_region, "tag", tag_disp(tag_), @@ -135,12 +135,9 @@ class communicator_impl : public communicator_base { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); // clang-format on - OOMPH_DP_ONLY(com_deb, - debug(NS_DEBUG::str<>("inject tagged"), - "->", NS_DEBUG::dec<2>(dst_addr_), - send_region, - "tag", tag_disp(tag_), - "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep()))); + LF_DEB(com_deb, + debug(NS_DEBUG::str<>("inject tagged"), "->", NS_DEBUG::dec<2>(dst_addr_), send_region, + "tag", tag_disp(tag_), "tx endpoint", NS_DEBUG::ptr(m_tx_endpoint.get_ep()))); // clang-format off execute_fi_function(fi_tinject, "fi_tinject", m_tx_endpoint.get_ep(), send_region.get_address(), size, dst_addr_, tag_); @@ -155,8 +152,8 @@ class communicator_impl : public communicator_base { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); // clang-format off - OOMPH_DP_ONLY(com_deb, - debug(NS_DEBUG::str<>("recv message buffer"), + LF_DEB(com_deb, + debug(NS_DEBUG::str<>("recv_tagged_region"), "<-", NS_DEBUG::dec<2>(src_addr_), recv_region, "tag", tag_disp(tag_), @@ -175,15 +172,14 @@ class communicator_impl : public communicator_base std::size_t* scheduled) { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, this->rank()); + std::uint64_t stag = make_tag64(tag, /*this->rank(), */this->m_context->get_context_tag()); auto& reg = ptr.handle_ref(); #ifdef EXTRA_SIZE_CHECKS if (size != reg.get_size()) { - OOMPH_DP_ONLY(com_err, - error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size", - NS_DEBUG::hex<6>(reg.get_size()))); + LF_DEB(com_err, error(NS_DEBUG::str<>("send mismatch"), "size", NS_DEBUG::hex<6>(size), + "reg size", NS_DEBUG::hex<6>(reg.get_size()))); } #endif m_context->get_controller()->sends_posted_++; @@ -215,7 +211,7 @@ class communicator_impl : public communicator_base s->create_self_ref(); // clang-format off - OOMPH_DP_ONLY(com_deb, + LF_DEB(com_deb, debug(NS_DEBUG::str<>("Send"), "thisrank", NS_DEBUG::dec<>(rank()), "rank", NS_DEBUG::dec<>(dst), @@ -238,15 +234,14 @@ class communicator_impl : public communicator_base std::size_t* scheduled) { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, src); + std::uint64_t stag = make_tag64(tag, /*src, */this->m_context->get_context_tag()); auto& reg = ptr.handle_ref(); #ifdef EXTRA_SIZE_CHECKS if (size != reg.get_size()) { - OOMPH_DP_ONLY(com_err, - error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size", - NS_DEBUG::hex<6>(reg.get_size()))); + LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), + "reg size", NS_DEBUG::hex<6>(reg.get_size()))); } #endif m_context->get_controller()->recvs_posted_++; @@ -257,8 +252,8 @@ class communicator_impl : public communicator_base s->create_self_ref(); // clang-format off - OOMPH_DP_ONLY(com_deb, - debug(NS_DEBUG::str<>("Recv"), + LF_DEB(com_deb, + debug(NS_DEBUG::str<>("recv"), "thisrank", NS_DEBUG::dec<>(rank()), "rank", NS_DEBUG::dec<>(src), "tag", tag_disp(std::uint64_t(tag)), @@ -281,15 +276,14 @@ class communicator_impl : public communicator_base std::atomic* scheduled) { [[maybe_unused]] auto scp = com_deb.scope(NS_DEBUG::ptr(this), __func__); - std::uint64_t stag = make_tag64(tag, src); + std::uint64_t stag = make_tag64(tag, /*src, */this->m_context->get_context_tag()); auto& reg = ptr.handle_ref(); #ifdef EXTRA_SIZE_CHECKS if (size != reg.get_size()) { - OOMPH_DP_ONLY(com_err, - error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), "reg size", - NS_DEBUG::hex<6>(reg.get_size()))); + LF_DEB(com_err, error(NS_DEBUG::str<>("recv mismatch"), "size", NS_DEBUG::hex<6>(size), + "reg size", NS_DEBUG::hex<6>(reg.get_size()))); } #endif m_context->get_controller()->recvs_posted_++; @@ -300,8 +294,8 @@ class communicator_impl : public communicator_base s->create_self_ref(); // clang-format off - OOMPH_DP_ONLY(com_deb, - debug(NS_DEBUG::str<>("Recv"), + LF_DEB(com_deb, + debug(NS_DEBUG::str<>("shared_recv"), "thisrank", NS_DEBUG::dec<>(rank()), "rank", NS_DEBUG::dec<>(src), "tag", tag_disp(std::uint64_t(tag)), @@ -368,7 +362,7 @@ class communicator_impl : public communicator_base // submit the cancellation request bool ok = (fi_cancel(&m_rx_endpoint.get_ep()->fid, op_ctx) == 0); - OOMPH_DP_ONLY(com_deb, + LF_DEB(com_deb, debug(NS_DEBUG::str<>("Cancel"), "ok", ok, "op_ctx", NS_DEBUG::ptr(op_ctx))); // if the cancel operation failed completely, return @@ -387,8 +381,8 @@ class communicator_impl : public communicator_base { // our recv was cancelled correctly found = true; - OOMPH_DP_ONLY(com_deb, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx", - NS_DEBUG::ptr(op_ctx))); + LF_DEB(com_deb, debug(NS_DEBUG::str<>("Cancel"), "succeeded", "op_ctx", + NS_DEBUG::ptr(op_ctx))); auto ptr = s->release_self_ref(); s->set_canceled(); } diff --git a/src/libfabric/context.cpp b/src/libfabric/context.cpp index 1baca07b..cbbc4b26 100644 --- a/src/libfabric/context.cpp +++ b/src/libfabric/context.cpp @@ -7,6 +7,8 @@ * Please, refer to the LICENSE file in the root directory. * SPDX-License-Identifier: BSD-3-Clause */ +#include +// #include // paths relative to backend #include @@ -17,9 +19,9 @@ namespace oomph { // cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print src_deb("__SRC__"); +static NS_DEBUG::enable_print src_deb("__SRC__"); -using controller_type = oomph::libfabric::controller; +using controller_type = libfabric::controller; context_impl::context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_never_free, std::size_t message_pool_reserve) @@ -31,6 +33,12 @@ context_impl::context_impl(MPI_Comm comm, bool thread_safe, bool message_pool_ne int rank, size; OOMPH_CHECK_MPI_RESULT(MPI_Comm_rank(comm, &rank)); OOMPH_CHECK_MPI_RESULT(MPI_Comm_size(comm, &size)); + + m_ctxt_tag = reinterpret_cast(this); + OOMPH_CHECK_MPI_RESULT(MPI_Bcast(&m_ctxt_tag, 1, MPI_UINT64_T, 0, comm)); + LF_DEB(src_deb, debug(NS_DEBUG::str<>("Broadcast"), "rank", debug::dec<3>(rank), "context", + debug::ptr(m_ctxt_tag))); + // TODO fix the thread safety // problem: controller is a singleton and has problems when 2 contexts are created in the // following order: single threaded first, then multi-threaded after @@ -75,8 +83,8 @@ context_impl::init_libfabric_controller(oomph::context_impl* /*ctx*/, MPI_Comm c static std::shared_ptr instance(nullptr); if (!instance.get()) { - OOMPH_DP_ONLY(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank), - "size", debug::dec<3>(size), "threads", debug::dec<3>(threads))); + LF_DEB(src_deb, debug(NS_DEBUG::str<>("New Controller"), "rank", debug::dec<3>(rank), + "size", debug::dec<3>(size), "threads", debug::dec<3>(threads))); instance.reset(new controller_type()); instance->initialize(HAVE_LIBFABRIC_PROVIDER, rank == 0, size, threads, comm); } diff --git a/src/libfabric/context.hpp b/src/libfabric/context.hpp index f3b34259..d17d8a05 100644 --- a/src/libfabric/context.hpp +++ b/src/libfabric/context.hpp @@ -26,14 +26,16 @@ namespace oomph { -using controller_type = oomph::libfabric::controller; +static NS_DEBUG::enable_print ctx_deb("CONTEXT"); + +using controller_type = libfabric::controller; class context_impl : public context_base { public: - using region_type = oomph::libfabric::memory_segment; + using region_type = libfabric::memory_segment; using domain_type = region_type::provider_domain; - using device_region_type = oomph::libfabric::memory_segment; + using device_region_type = libfabric::memory_segment; using heap_type = hwmalloc::heap; using callback_queue = boost::lockfree::queue, boost::lockfree::allocator>>; @@ -42,6 +44,7 @@ class context_impl : public context_base heap_type m_heap; domain_type* m_domain; std::shared_ptr m_controller; + std::uintptr_t m_ctxt_tag; public: // -------------------------------------------------- @@ -61,22 +64,23 @@ class context_impl : public context_base context_impl(context_impl const&) = delete; context_impl(context_impl&&) = delete; - region_type make_region(void* const ptr, std::size_t size, bool /*device*/) + region_type make_region(void* const ptr, std::size_t size, int device_id) { - bool bind_mr = ((m_controller->memory_registration_mode_flags() & FI_MR_ENDPOINT) != 0); - if (bind_mr) { - void *endpoint = m_controller->get_rx_endpoint().get_ep(); - return oomph::libfabric::memory_segment(m_domain, ptr, size, bind_mr, endpoint); - } - else { - return oomph::libfabric::memory_segment(m_domain, ptr, size, false, nullptr); + if (m_controller->get_mrbind()) + { + void* endpoint = m_controller->get_rx_endpoint().get_ep(); + return libfabric::memory_segment(m_domain, ptr, size, true, endpoint, device_id); } + else { return libfabric::memory_segment(m_domain, ptr, size, false, nullptr, device_id); } } auto& get_heap() noexcept { return m_heap; } communicator_impl* get_communicator(); + // we must modify all tags to use 32bits of context ptr for uniqueness + inline std::uintptr_t get_context_tag() { return m_ctxt_tag; } + inline controller_type* get_controller() /*const */ { return m_controller.get(); } const char* get_transport_option(const std::string& opt); @@ -107,9 +111,8 @@ class context_impl : public context_base { // our recv was cancelled correctly found = true; - OOMPH_DP_ONLY(libfabric::ctx_deb, - debug(NS_DEBUG::str<>("Cancel shared"), "succeeded", "op_ctx", - NS_DEBUG::ptr(op_ctx))); + LF_DEB(oomph::ctx_deb, debug(NS_DEBUG::str<>("Cancel shared"), "succeeded", + "op_ctx", NS_DEBUG::ptr(op_ctx))); auto ptr = s->release_self_ref(); s->set_canceled(); } @@ -138,15 +141,15 @@ template<> inline oomph::libfabric::memory_segment register_memory(oomph::context_impl& c, void* const ptr, std::size_t size) { - return c.make_region(ptr, size, false); + return c.make_region(ptr, size, -2); } #if OOMPH_ENABLE_DEVICE template<> inline oomph::libfabric::memory_segment -register_device_memory(context_impl& c, void* ptr, std::size_t size) +register_device_memory(context_impl& c, int device_id, void* ptr, std::size_t size) { - return c.make_region(ptr, size, true); + return c.make_region(ptr, size, device_id); } #endif diff --git a/src/libfabric/controller.hpp b/src/libfabric/controller.hpp index 301c0eb5..04443a36 100644 --- a/src/libfabric/controller.hpp +++ b/src/libfabric/controller.hpp @@ -35,24 +35,21 @@ #include #include // +#include "oomph_libfabric_defines.hpp" #include "fabric_error.hpp" #include "locality.hpp" #include "memory_region.hpp" #include "operation_context.hpp" -#include "simple_counter.hpp" -#include "print.hpp" #include "controller_base.hpp" // #include // -#include "libfabric_defines.hpp" -// #include namespace NS_DEBUG { // cppcheck-suppress ConfigurationNotChecked -static debug::enable_print cnt_deb("CONTROL"); +static debug::enable_print cnt_deb("CONTROL"); static debug::enable_print cnt_err("CONTROL"); } // namespace NS_DEBUG @@ -79,7 +76,7 @@ class controller : public controller_base // -------------------------------------------------------------------- constexpr fi_threading threadlevel_flags() { -#if defined(HAVE_LIBFABRIC_GNI) || defined(HAVE_LIBFABRIC_CXI) +#if defined(HAVE_LIBFABRIC_GNI) /*|| defined(HAVE_LIBFABRIC_CXI)*/ return FI_THREAD_ENDPOINT; #else return FI_THREAD_SAFE; @@ -87,7 +84,15 @@ class controller : public controller_base } // -------------------------------------------------------------------- - constexpr uint64_t caps_flags() { return FI_MSG | FI_TAGGED; } + constexpr uint64_t caps_flags() + { +#if OOMPH_ENABLE_DEVICE + std::int64_t hmem_flags = FI_HMEM; +#else + std::int64_t hmem_flags = 0; +#endif + return hmem_flags | FI_MSG | FI_TAGGED | FI_RMA | FI_READ | FI_WRITE | FI_RECV | FI_SEND | FI_TRANSMIT | FI_REMOTE_READ | FI_REMOTE_WRITE ; + } // -------------------------------------------------------------------- // we do not need to perform any special actions on init (to contact root node) @@ -102,14 +107,14 @@ class controller : public controller_base // if (rank > 0) { - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending here"), iplocality(here_), "size", - locality_defs::array_size)); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending here"), iplocality(here_), "size", + locality_defs::array_size)); /*int err = */ MPI_Send(here_.fabric_data(), locality_defs::array_size, MPI_CHAR, 0, // dst rank 0, // tag comm); - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving all"), "size", locality_defs::array_size)); MPI_Status status; @@ -117,28 +122,29 @@ class controller : public controller_base 0, // src rank 0, // tag comm, &status); - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("received addresses"))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("received addresses"))); } else { - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving addresses"))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving addresses"))); memcpy(&localities[0], here_.fabric_data(), locality_defs::array_size); for (int i = 1; i < size; ++i) { - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("receiving address"), debug::dec<>(i))); + LF_DEB(NS_DEBUG::cnt_deb, + debug(debug::str<>("receiving address"), debug::dec<>(i))); MPI_Status status; /*int err = */ MPI_Recv(&localities[i * locality_defs::array_size], size * locality_defs::array_size, MPI_CHAR, i, // src rank 0, // tag comm, &status); - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("received address"), debug::dec<>(i))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("received address"), debug::dec<>(i))); } - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending all"))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending all"))); for (int i = 1; i < size; ++i) { - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("sending to"), debug::dec<>(i))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("sending to"), debug::dec<>(i))); /*int err = */ MPI_Send(&localities[0], size * locality_defs::array_size, MPI_CHAR, i, // dst rank 0, // tag @@ -147,7 +153,7 @@ class controller : public controller_base } // all ranks should now have a full localities vector - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("populating vector"))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("populating vector"))); for (int i = 0; i < size; ++i) { locality temp; @@ -168,11 +174,11 @@ class controller : public controller_base MPI_Comm_rank(mpi_comm, &rank); MPI_Comm_size(mpi_comm, &size); - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("initialize_localities"), size, "localities")); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("initialize_localities"), size, "localities")); MPI_exchange_localities(av, mpi_comm, rank, size); debug_print_av_vector(size); - DEBUG(NS_DEBUG::cnt_deb, debug(debug::str<>("Done localities"))); + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Done localities"))); } // -------------------------------------------------------------------- @@ -182,8 +188,7 @@ class controller : public controller_base return true; #elif defined(HAVE_LIBFABRIC_CXI) // @todo : cxi provider is not yet thread safe using scalable endpoints - return (threadlevel_flags() == FI_THREAD_SAFE || - endpoint_type_ == endpoint_type::threadlocalTx); + return false; #else return (threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::threadlocalTx); @@ -240,7 +245,8 @@ class controller : public controller_base send_poll_stamp = now; #endif int ret; - fi_cq_msg_entry entry[256]; // max_completions_per_poll_ must be <= this + fi_cq_msg_entry entry[max_completions_array_limit_]; + assert(max_completions_per_poll_ <= max_completions_array_limit_); { auto lock = try_tx_lock(); @@ -249,7 +255,7 @@ class controller : public controller_base if (!bypass_tx_lock() && !lock.owns_lock()) { return -1; } static auto polling = NS_DEBUG::cnt_deb.make_timer(1, debug::str<>("poll send queue")); - DEBUG(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(send_cq))); + LF_DEB(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(send_cq))); // poll for completions { @@ -293,14 +299,14 @@ class controller : public controller_base for (int i = 0; i < ret; ++i) { ++sends_complete; - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Completion"), i, debug::dec<2>(i), "txcq flags", fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(", debug::dec<>(entry[i].flags), ")", "context", NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len))); if ((entry[i].flags & (FI_TAGGED | FI_SEND | FI_MSG)) != 0) { - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Completion"), "txcq tagged send completion", NS_DEBUG::ptr(entry[i].op_context))); @@ -336,8 +342,8 @@ class controller : public controller_base recv_poll_stamp = now; #endif int ret; - fi_cq_msg_entry entry[256]; // max_completions_per_poll_ must be <= this - + fi_cq_msg_entry entry[max_completions_array_limit_]; + assert(max_completions_per_poll_ <= max_completions_array_limit_); { auto lock = get_rx_lock(); @@ -346,7 +352,7 @@ class controller : public controller_base if (!bypass_rx_lock() && !lock.owns_lock()) { return -1; } static auto polling = NS_DEBUG::cnt_deb.make_timer(1, debug::str<>("poll recv queue")); - DEBUG(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(rx_cq))); + LF_DEB(NS_DEBUG::cnt_deb, timed(polling, NS_DEBUG::ptr(rx_cq))); // poll for completions { @@ -362,7 +368,7 @@ class controller : public controller_base // from the manpage 'man 3 fi_cq_readerr' if (e.err == FI_ECANCELED) { - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("rxcq Cancelled"), "flags", debug::hex<6>(e.flags), "len", debug::hex<6>(e.len), "context", NS_DEBUG::ptr(e.op_context))); // the request was cancelled, we can simply exit @@ -373,9 +379,9 @@ class controller : public controller_base } else if (e.err != FI_SUCCESS) { - NS_DEBUG::cnt_err.error("rxcq Error ??? ", "err", debug::dec<>(-e.err), "flags", + NS_DEBUG::cnt_err.error(debug::str<>("poll_recv_queue"), "error code", debug::dec<>(-e.err), "flags", debug::hex<6>(e.flags), "len", debug::hex<6>(e.len), "context", - NS_DEBUG::ptr(e.op_context), "error", + NS_DEBUG::ptr(e.op_context), "error msg", fi_cq_strerror(rx_cq, e.prov_errno, e.err_data, (char*)e.buf, e.len)); } operation_context* handler = reinterpret_cast(e.op_context); @@ -392,14 +398,14 @@ class controller : public controller_base for (int i = 0; i < ret; ++i) { ++recvs_complete; - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Completion"), i, "rxcq flags", fi_tostr(&entry[i].flags, FI_TYPE_CQ_EVENT_FLAGS), "(", debug::dec<>(entry[i].flags), ")", "context", NS_DEBUG::ptr(entry[i].op_context), "length", debug::hex<6>(entry[i].len))); if ((entry[i].flags & (FI_TAGGED | FI_RECV)) != 0) { - DEBUG(NS_DEBUG::cnt_deb, + LF_DEB(NS_DEBUG::cnt_deb, debug(debug::str<>("Completion"), "rxcq tagged recv completion", NS_DEBUG::ptr(entry[i].op_context))); @@ -430,14 +436,16 @@ class controller : public controller_base (void)info; // unused variable warning (void)tx; // unused variable warning - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); struct fi_info* hints = fi_dupinfo(info); if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); // clear any Rx address data that might be set - free(hints->src_addr); + // free(hints->src_addr); + // hints->src_addr = nullptr; + // hints->src_addrlen = 0; free(hints->dest_addr); - hints->src_addr = nullptr; hints->dest_addr = nullptr; + hints->dest_addrlen = 0; return hints; } }; diff --git a/src/libfabric/controller_base.hpp b/src/libfabric/controller_base.hpp index 1a4b768d..41191a57 100644 --- a/src/libfabric/controller_base.hpp +++ b/src/libfabric/controller_base.hpp @@ -38,33 +38,15 @@ #include #include // -#include "libfabric_defines.hpp" +#include "oomph_libfabric_defines.hpp" // #include "fabric_error.hpp" #include "locality.hpp" #include "memory_region.hpp" #include "operation_context_base.hpp" -#include "simple_counter.hpp" - -// ------------------------------------------------------------------ -// This section exists to make interoperabily/sharing of code -// between OOMPH/GHEX and HPX easier -#if __has_include("print.hpp") -#include "print.hpp" -#define NS_LIBFABRIC oomph::libfabric -#define DEBUG OOMPH_DP_ONLY -#elif __has_include() -#include -#define NS_LIBFABRIC hpx::parcelset::policies::libfabric -using namespace NS_LIBFABRIC; -#endif - -#if __has_include("simple_counter.hpp") -#include "simple_counter.hpp" -#endif //#define DISABLE_FI_INJECT -// #define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50 +//#define EXCESSIVE_POLLING_BACKOFF_MICRO_S 50 // ------------------------------------------------------------------ @@ -150,7 +132,8 @@ static int libfabric_completions_per_poll() { auto env_str = std::getenv("LIBFABRIC_POLL_SIZE"); - if (env_str != nullptr) { + if (env_str != nullptr) + { try { return std::atoi(env_str); @@ -169,7 +152,8 @@ static int libfabric_rendezvous_threshold(int def_val) { auto env_str = std::getenv("LIBFABRIC_RENDEZVOUS_THRESHOLD"); - if (env_str != nullptr) { + if (env_str != nullptr) + { try { char* end; @@ -191,12 +175,12 @@ libfabric_rendezvous_threshold(int def_val) #define OOMPH_GNI_REG "internal" //#define OOMPH_GNI_REG "udreg" -std::vector> gni_strs = { +static std::vector> gni_strs = { {GNI_MR_CACHE, "GNI_MR_CACHE"}, }; // clang-format off -std::vector> gni_ints = { +static std::vector> gni_ints = { {GNI_MR_CACHE_LAZY_DEREG, "GNI_MR_CACHE_LAZY_DEREG"}, {GNI_MR_HARD_REG_LIMIT, "GNI_MR_HARD_REG_LIMIT"}, {GNI_MR_SOFT_REG_LIMIT, "GNI_MR_SOFT_REG_LIMIT"}, @@ -225,13 +209,18 @@ std::vector> gni_ints = { // clang-format on #endif -#define LIBFABRIC_FI_VERSION_MAJOR 1 -#define LIBFABRIC_FI_VERSION_MINOR 11 +#if defined(HAVE_LIBFABRIC_CXI) +# define LIBFABRIC_FI_VERSION_MAJOR 1 +# define LIBFABRIC_FI_VERSION_MINOR 15 +#else +# define LIBFABRIC_FI_VERSION_MAJOR 1 +# define LIBFABRIC_FI_VERSION_MINOR 15 +#endif namespace NS_DEBUG { // cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print cnb_deb("CONBASE"); +static NS_DEBUG::enable_print cnb_deb("CONBASE"); static NS_DEBUG::enable_print cnb_err("CONBASE"); } // namespace NS_DEBUG @@ -255,7 +244,6 @@ struct progress_status namespace NS_LIBFABRIC { - /// A wrapper around fi_close that reports any error /// Because we use so many handles, we must be careful to /// delete them all before closing resources that use them @@ -263,7 +251,7 @@ template void fidclose(Handle fid, const char* msg) { - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg)); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("closing"), msg)); int ret = fi_close(fid); if (ret == -FI_EBUSY) { throw NS_LIBFABRIC::fabric_error(ret, "fi_close EBUSY"); } else if (ret == FI_SUCCESS) { return; } @@ -356,7 +344,7 @@ struct stack_endpoint ~stack_endpoint() { if (!pool_) return; - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "used push", "ep", NS_DEBUG::ptr(get_ep()), "tx cq", NS_DEBUG::ptr(get_tx_cq()), "rx cq", NS_DEBUG::ptr(get_rx_cq()))); pool_->push(endpoint_); @@ -422,10 +410,20 @@ class controller_base uint32_t max_completions_per_poll_; uint32_t msg_rendezvous_threshold_; + inline static constexpr uint32_t max_completions_array_limit_ = 256; static inline thread_local std::chrono::steady_clock::time_point send_poll_stamp; static inline thread_local std::chrono::steady_clock::time_point recv_poll_stamp; + // set if FI_MR_LOCAL is required (local access requires binding) + bool mrlocal = false; + // set if FI_MR_ENDPOINT is required (per endpoint memory binding) + bool mrbind = false; + // set if FI_MR_HRMEM provider requires heterogeneous memory registration + bool mrhmem = false; + public: + bool get_mrbind() { return mrbind;} + public: NS_LIBFABRIC::simple_counter sends_posted_; NS_LIBFABRIC::simple_counter recvs_posted_; @@ -436,7 +434,7 @@ class controller_base void finvoke(const char* msg, const char* err, int ret) { - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>(msg))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>(msg))); if (ret) throw NS_LIBFABRIC::fabric_error(ret, err); } @@ -474,7 +472,7 @@ class controller_base unsigned int rma_reads_ = 0; unsigned int recv_deletes_ = 0; - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("counters"), "Received messages", debug::dec<>(messages_handled_), "Total reads", debug::dec<>(rma_reads_), "Total deletes", debug::dec<>(recv_deletes_), "deletes error", @@ -511,7 +509,7 @@ class controller_base fidclose(&fabric_->fid, "Fabric"); // clean up - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("freeing fabric_info"))); fi_freeinfo(fabric_info_); } @@ -519,7 +517,8 @@ class controller_base // -------------------------------------------------------------------- // setup an endpoint for receiving messages, // usually an rx endpoint is shared by all threads - endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info, struct fid_av* av) + endpoint_wrapper create_rx_endpoint(struct fid_domain* domain, struct fi_info* info, + struct fid_av* av) { auto ep_rx = new_endpoint_active(domain, info, false); @@ -541,24 +540,24 @@ class controller_base void initialize(std::string const& provider, bool rootnode, int size, size_t threads, Args&&... args) { - DEBUG(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); })); + LF_DEB(NS_DEBUG::cnb_deb, eval([]() { std::cout.setf(std::ios::unitbuf); })); [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); max_completions_per_poll_ = libfabric_completions_per_poll(); - DEBUG(NS_DEBUG::cnb_err, + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Poll completions"), debug::dec<3>(max_completions_per_poll_))); uint32_t default_val = (threads == 1) ? 0x400 : 0x4000; msg_rendezvous_threshold_ = libfabric_rendezvous_threshold(default_val); - DEBUG(NS_DEBUG::cnb_err, + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Rendezvous threshold"), debug::hex<4>(msg_rendezvous_threshold_))); endpoint_type_ = static_cast(libfabric_endpoint_type()); - DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string())); + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("Endpoints"), libfabric_endpoint_string())); eps_ = std::make_unique(); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Threads"), debug::dec<3>(threads))); open_fabric(provider, threads, rootnode); @@ -566,7 +565,8 @@ class controller_base av_ = create_address_vector(fabric_info_, size, threads); // we need an rx endpoint in all cases except scalable rx - if (endpoint_type_ != endpoint_type::scalableTxRx) { + if (endpoint_type_ != endpoint_type::scalableTxRx) + { // setup an endpoint for receiving messages // rx endpoint is typically shared by all threads eps_->ep_rx_ = create_rx_endpoint(fabric_domain_, fabric_info_, av_); @@ -578,8 +578,10 @@ class controller_base auto tx_cq = bind_tx_queue_to_rx_endpoint(fabric_info_, eps_->ep_rx_.get_ep()); eps_->ep_rx_.set_tx_cq(tx_cq); } - else if (endpoint_type_ != endpoint_type::scalableTxRx) { -#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) + else if (endpoint_type_ != endpoint_type::scalableTxRx) + { +#if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || \ + defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_CXI) || defined(HAVE_LIBFABRIC_EFA) // it appears that the rx endpoint cannot be enabled if it does not // have a Tx CQ (at least when using sockets), so we create a dummy // Tx CQ and bind it just to stop libfabric from triggering an error. @@ -599,11 +601,11 @@ class controller_base auto ep_tx = new_endpoint_active(fabric_domain_, fabric_info_, true); // create a completion queue for tx endpoint - fabric_info_->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION; - auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, - "tx multiple"); + fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); + auto tx_cq = + create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, "tx multiple"); - bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "rx multiple"); + bind_queue_to_endpoint(ep_tx, tx_cq, FI_TRANSMIT | FI_RECV, "tx multiple"); bind_address_vector_to_endpoint(ep_tx, av_); enable_endpoint(ep_tx, "tx multiple"); @@ -619,10 +621,11 @@ class controller_base { // setup tx contexts for each possible thread size_t threads_allocated = 0; - auto ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, threads_allocated); + auto ep_sx = new_endpoint_scalable(fabric_domain_, fabric_info_, true /*Tx*/, threads, + threads_allocated); - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), - "Contexts allocated", debug::dec<4>(threads_allocated))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), + "Contexts allocated", debug::dec<4>(threads_allocated))); finvoke("fi_scalable_ep_bind AV", "fi_scalable_ep_bind", fi_scalable_ep_bind(ep_sx, &av_->fid, 0)); @@ -648,7 +651,7 @@ class controller_base enable_endpoint(scalable_ep_tx, "tx scalable"); endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "initial tx push", "ep", NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq", NS_DEBUG::ptr(tx.get_rx_cq()))); @@ -661,7 +664,7 @@ class controller_base // once enabled we can get the address enable_endpoint(eps_->ep_rx_.get_ep(), "rx here"); here_ = get_endpoint_address(&eps_->ep_rx_.get_ep()->fid); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting 'here'"), iplocality(here_))); // // if we are using scalable endpoints, then setup tx/rx contexts // // we will us a single endpoint for all Tx/Rx contexts @@ -676,7 +679,7 @@ class controller_base // if (!ep_sx) // throw NS_LIBFABRIC::fabric_error(FI_EOTHER, "fi_scalable endpoint creation failed"); - // DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), + // LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint ok"), // "Contexts allocated", debug::dec<4>(threads_allocated))); // // prepare the stack for insertions @@ -706,7 +709,7 @@ class controller_base // enable_endpoint(scalable_ep_tx, "tx scalable"); // endpoint_wrapper tx(scalable_ep_tx, nullptr, scalable_cq_tx, "tx scalable"); - // DEBUG(NS_DEBUG::cnb_deb, + // LF_DEB(NS_DEBUG::cnb_deb, // trace(debug::str<>("Scalable Ep"), "initial tx push", "ep", // NS_DEBUG::ptr(tx.get_ep()), "tx cq", NS_DEBUG::ptr(tx.get_tx_cq()), "rx cq", // NS_DEBUG::ptr(tx.get_rx_cq()))); @@ -724,7 +727,7 @@ class controller_base //// enable_endpoint(scalable_ep_rx, "rx scalable"); //// endpoint_wrapper rx(scalable_ep_rx, scalable_cq_rx, nullptr, "rx scalable"); - //// DEBUG(NS_DEBUG::cnb_deb, + //// LF_DEB(NS_DEBUG::cnb_deb, //// trace(debug::str<>("Scalable Ep"), "initial rx push", "ep", //// NS_DEBUG::ptr(rx.get_ep()), "tx cq", NS_DEBUG::ptr(rx.get_tx_cq()), "rx cq", //// NS_DEBUG::ptr(rx.get_rx_cq()))); @@ -750,24 +753,26 @@ class controller_base } // -------------------------------------------------------------------- - constexpr int memory_registration_mode_flags() + constexpr std::int64_t memory_registration_mode_flags() { - // use basic registration for providers except CXI + std::int64_t base_flags = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; +#if OOMPH_ENABLE_DEVICE + base_flags = base_flags | FI_MR_HMEM; +#endif + base_flags = base_flags | FI_MR_LOCAL; + #if defined(HAVE_LIBFABRIC_CXI) - int base_flags = - FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | FI_MR_MMU_NOTIFY; - return base_flags | FI_MR_ENDPOINT | FI_MR_HMEM; -#elif defined(HAVE_LIBFABRIC_GNI) - return FI_MR_BASIC; // FI_MR_SCALABLE one day?; + return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT; + +#elif defined(HAVE_LIBFABRIC_EFA) + return base_flags | FI_MR_MMU_NOTIFY | FI_MR_ENDPOINT; #else - return FI_MR_BASIC; + return base_flags; #endif } // -------------------------------------------------------------------- - uint32_t rendezvous_threshold() { - return msg_rendezvous_threshold_; - } + uint32_t rendezvous_threshold() { return msg_rendezvous_threshold_; } // -------------------------------------------------------------------- // initialize the basic fabric/domain/name void open_fabric(std::string const& provider, int threads, bool rootnode) @@ -780,10 +785,12 @@ class controller_base throw NS_LIBFABRIC::fabric_error(-1, "Failed to allocate fabric hints"); } - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Here locality"), iplocality(here_))); #if defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_TCP) || defined(HAVE_LIBFABRIC_VERBS) fabric_hints_->addr_format = FI_SOCKADDR_IN; +#elif defined(HAVE_LIBFABRIC_EFA) + fabric_hints_->addr_format = FI_ADDR_EFA; #endif fabric_hints_->caps = caps_flags(); @@ -800,7 +807,7 @@ class controller_base strdup(std::string(provider + ";ofi_rxm").c_str()); } else { fabric_hints_->fabric_attr->prov_name = strdup(provider.c_str()); } - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric provider"), fabric_hints_->fabric_attr->prov_name)); fabric_hints_->domain_attr->mr_mode = memory_registration_mode_flags(); @@ -809,11 +816,11 @@ class controller_base auto progress = libfabric_progress_type(); fabric_hints_->domain_attr->control_progress = progress; fabric_hints_->domain_attr->data_progress = progress; - DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string())); + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("progress"), libfabric_progress_string())); if (threads > 1) { - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_FID"))); // Enable thread safe mode (Does not work with psm2 provider) // fabric_hints_->domain_attr->threading = FI_THREAD_SAFE; //fabric_hints_->domain_attr->threading = FI_THREAD_FID; @@ -821,7 +828,7 @@ class controller_base } else { - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("FI_THREAD_DOMAIN"))); // we serialize everything fabric_hints_->domain_attr->threading = FI_THREAD_DOMAIN; } @@ -829,11 +836,11 @@ class controller_base // Enable resource management fabric_hints_->domain_attr->resource_mgmt = FI_RM_ENABLED; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM")); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fabric endpoint"), "RDM")); fabric_hints_->ep_attr->type = FI_EP_RDM; uint64_t flags = 0; - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("get fabric info"), "FI_VERSION", debug::dec(LIBFABRIC_FI_VERSION_MAJOR), debug::dec(LIBFABRIC_FI_VERSION_MINOR))); @@ -843,29 +850,32 @@ class controller_base if (rootnode) { - DEBUG(NS_DEBUG::cnb_err, + LF_DEB(NS_DEBUG::cnb_err, trace(debug::str<>("Fabric info"), "\n", fi_tostr(fabric_info_, FI_TYPE_INFO))); } bool context = (fabric_hints_->mode & FI_CONTEXT) != 0; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context)); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_CONTEXT"), context)); - bool mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal)); + mrlocal = (fabric_hints_->domain_attr->mr_mode & FI_MR_LOCAL) != 0; + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_LOCAL"), mrlocal)); - bool mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), mrbind)); + mrbind = (fabric_hints_->domain_attr->mr_mode & FI_MR_ENDPOINT) != 0; + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ENDPOINT"), mrbind)); /* Check if provider requires heterogeneous memory registration */ - bool mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem)); + mrhmem = (fabric_hints_->domain_attr->mr_mode & FI_MR_HMEM) != 0; + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_HMEM"), mrhmem)); + + bool mrhalloc = (fabric_hints_->domain_attr->mr_mode & FI_MR_ALLOCATED) != 0; + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Requires FI_MR_ALLOCATED"), mrhalloc)); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating fi_fabric"))); ret = fi_fabric(fabric_info_->fabric_attr, &fabric_, nullptr); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "Failed to get fi_fabric"); // Allocate a domain. - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Allocating domain"))); ret = fi_domain(fabric_, fabric_info_, &fabric_domain_, nullptr); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_domain"); @@ -874,18 +884,20 @@ class controller_base [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), "GNI memory registration block"); - DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values")); + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI String values")); // Dump out all vars for debug purposes - for (auto &gni_data : gni_strs) { - _set_check_domain_op_value(gni_data.first, 0, - gni_data.second.c_str(), false); + for (auto& gni_data : gni_strs) + { + _set_check_domain_op_value(gni_data.first, 0, gni_data.second.c_str(), + false); } - DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values")); - for (auto &gni_data : gni_ints) { - _set_check_domain_op_value(gni_data.first, 0, - gni_data.second.c_str(), false); + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"), "GNI Int values")); + for (auto& gni_data : gni_ints) + { + _set_check_domain_op_value(gni_data.first, 0, gni_data.second.c_str(), + false); } - DEBUG(NS_DEBUG::cnb_err, debug(debug::str<>("-------"))); + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("-------"))); // -------------------------- // GNI_MR_CACHE @@ -908,7 +920,7 @@ class controller_base // Enable lazy deregistration in MR cache // int32_t enable = 1; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("setting GNI_MR_CACHE_LAZY_DEREG"))); _set_check_domain_op_value(GNI_MR_CACHE_LAZY_DEREG, enable, "GNI_MR_CACHE_LAZY_DEREG"); @@ -947,11 +959,13 @@ class controller_base static struct fi_gni_ops_domain* gni_domain_ops = nullptr; int ret = 0; - if (gni_domain_ops == nullptr) { + if (gni_domain_ops == nullptr) + { ret = fi_open_ops(&fabric_domain_->fid, FI_GNI_DOMAIN_OPS_1, 0, (void**)&gni_domain_ops, nullptr); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"), - NS_DEBUG::ptr(gni_domain_ops))); + LF_DEB(NS_DEBUG::cnb_deb, + debug(debug::str<>("gni open ops"), (ret == 0 ? "OK" : "FAIL"), + NS_DEBUG::ptr(gni_domain_ops))); } // if open was ok and set flag is present, then set value @@ -960,24 +974,25 @@ class controller_base ret = gni_domain_ops->set_val(&fabric_domain_->fid, (dom_ops_val_t)(op), reinterpret_cast(&value)); - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("gni set ops val"), value, (ret == 0 ? "OK" : "FAIL"))); } // Get the value (so we can check that the value we set is now returned) T new_value; ret = gni_domain_ops->get_val(&fabric_domain_->fid, (dom_ops_val_t)(op), &new_value); - if constexpr (std::is_integral::value) { - DEBUG(NS_DEBUG::cnb_err, - debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, debug::hex<8>(new_value))); + if constexpr (std::is_integral::value) + { + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), + info, debug::hex<8>(new_value))); } - else { - DEBUG(NS_DEBUG::cnb_err, + else + { + LF_DEB(NS_DEBUG::cnb_err, debug(debug::str<>("gni op val"), (ret == 0 ? "OK" : "FAIL"), info, new_value)); } // - if (ret) - throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info); + if (ret) throw NS_LIBFABRIC::fabric_error(ret, std::string("setting ") + info); return ret; } @@ -994,7 +1009,7 @@ class controller_base struct fi_info* hints = set_src_dst_addresses(info, tx); [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Got info mode"), (info->mode & FI_NOTIFY_FLAGS_ONLY))); struct fid_ep* ep; @@ -1005,8 +1020,7 @@ class controller_base "endpoints?)"); } fi_freeinfo(hints); - DEBUG(NS_DEBUG::cnb_deb, - debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_active"), NS_DEBUG::ptr(ep))); return ep; } @@ -1019,7 +1033,7 @@ class controller_base [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("fi_dupinfo"))); struct fi_info* hints = fi_dupinfo(info); if (!hints) throw NS_LIBFABRIC::fabric_error(0, "fi_dupinfo"); @@ -1035,7 +1049,7 @@ class controller_base else { context_count = std::min(new_hints->domain_attr->rx_ctx_cnt, threads); } // clang-format off - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("scalable endpoint"), "Tx", tx, "Threads", debug::dec<3>(threads), @@ -1051,8 +1065,7 @@ class controller_base struct fid_ep* ep; ret = fi_scalable_ep(domain, new_hints, &ep, nullptr); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_scalable_ep"); - DEBUG(NS_DEBUG::cnb_deb, - debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("new_endpoint_scalable"), NS_DEBUG::ptr(ep))); fi_freeinfo(hints); return ep; } @@ -1061,7 +1074,7 @@ class controller_base endpoint_wrapper& get_rx_endpoint() { static auto rx = NS_DEBUG::cnb_deb.make_timer(1, debug::str<>("get_rx_endpoint")); - DEBUG(NS_DEBUG::cnb_deb, timed(rx)); + LF_DEB(NS_DEBUG::cnb_deb, timed(rx)); if (endpoint_type_ == endpoint_type::scalableTxRx) { @@ -1072,7 +1085,7 @@ class controller_base if (!ok) { // clang-format off - DEBUG(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx", + LF_DEB(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop rx", "ep", NS_DEBUG::ptr(ep.get_ep()), "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()), "rx cq", NS_DEBUG::ptr(ep.get_rx_cq()))); @@ -1081,10 +1094,10 @@ class controller_base } eps_->tl_srx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), ep.get_name(), &rx_endpoints_); - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep", - NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq", - NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq", - NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq()))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop rx", "ep", + NS_DEBUG::ptr(eps_->tl_srx_.get_ep()), "tx cq", + NS_DEBUG::ptr(eps_->tl_srx_.get_tx_cq()), "rx cq", + NS_DEBUG::ptr(eps_->tl_srx_.get_rx_cq()))); } return eps_->tl_srx_.endpoint_; } @@ -1103,7 +1116,7 @@ class controller_base NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, "threadlocal"); // create a completion queue for tx endpoint - fabric_info_->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION; + fabric_info_->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); auto tx_cq = create_completion_queue(fabric_domain_, fabric_info_->tx_attr->size, "tx threadlocal"); @@ -1117,7 +1130,7 @@ class controller_base enable_endpoint(ep_tx, "tx threadlocal"); // set threadlocal endpoint wrapper - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Threadlocal Ep"), "create Tx", "ep", NS_DEBUG::ptr(ep_tx), "tx cq", NS_DEBUG::ptr(tx_cq), "rx cq", NS_DEBUG::ptr(nullptr))); // for cleaning up at termination @@ -1136,7 +1149,7 @@ class controller_base bool ok = tx_endpoints_.pop(ep); if (!ok) { - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, error(debug::str<>("Scalable Ep"), "pop tx", "ep", NS_DEBUG::ptr(ep.get_ep()), "tx cq", NS_DEBUG::ptr(ep.get_tx_cq()), "rx cq", NS_DEBUG::ptr(ep.get_rx_cq()))); @@ -1144,20 +1157,15 @@ class controller_base } eps_->tl_stx_ = stack_endpoint(ep.get_ep(), ep.get_rx_cq(), ep.get_tx_cq(), ep.get_name(), &tx_endpoints_); - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep", - NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq", - NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq", - NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq()))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("Scalable Ep"), "pop tx", "ep", + NS_DEBUG::ptr(eps_->tl_stx_.get_ep()), "tx cq", + NS_DEBUG::ptr(eps_->tl_stx_.get_tx_cq()), "rx cq", + NS_DEBUG::ptr(eps_->tl_stx_.get_rx_cq()))); } return eps_->tl_stx_.endpoint_; } else if (endpoint_type_ == endpoint_type::multiple) { return eps_->ep_tx_; } - else if (endpoint_type_ == endpoint_type::single) - { - // shared tx/rx endpoint - return eps_->ep_rx_; - } - // shared tx/rx endpoint + // single : shared tx/rx endpoint return eps_->ep_rx_; } @@ -1166,7 +1174,7 @@ class controller_base { [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding AV"), "to", NS_DEBUG::ptr(endpoint))); int ret = fi_ep_bind(endpoint, &av->fid, 0); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind address_vector"); } @@ -1177,7 +1185,8 @@ class controller_base { [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type)); + LF_DEB(NS_DEBUG::cnb_deb, + debug(debug::str<>("Binding CQ"), "to", NS_DEBUG::ptr(endpoint), type)); int ret = fi_ep_bind(endpoint, &cq->fid, cqtype); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "bind cq"); } @@ -1186,7 +1195,7 @@ class controller_base fid_cq* bind_tx_queue_to_rx_endpoint(struct fi_info* info, struct fid_ep* ep) { [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - info->tx_attr->op_flags |= FI_INJECT_COMPLETE | FI_COMPLETION; + info->tx_attr->op_flags |= (FI_INJECT_COMPLETE | FI_COMPLETION); fid_cq* tx_cq = create_completion_queue(fabric_domain_, info->tx_attr->size, "tx->rx"); // shared send/recv endpoint - bind send cq to the recv endpoint bind_queue_to_endpoint(ep, tx_cq, FI_TRANSMIT, "tx->rx bug fix"); @@ -1198,7 +1207,8 @@ class controller_base { [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__, type); - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint))); + LF_DEB(NS_DEBUG::cnb_deb, + debug(debug::str<>("Enabling endpoint"), NS_DEBUG::ptr(endpoint))); int ret = fi_enable(endpoint); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_enable"); } @@ -1227,14 +1237,14 @@ class controller_base temp1 << debug::ipaddr(&local_addr[i]) << " - "; } - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size", - debug::dec<>(addrlen), " : ", temp1.str().c_str())); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), "size", + debug::dec<>(addrlen), " : ", temp1.str().c_str())); std::stringstream temp2; for (std::size_t i = 0; i < locality_defs::array_length; ++i) { temp2 << debug::hex<8>(local_addr[i]) << " - "; } - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str())); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("raw address data"), temp2.str().c_str())); } return locality(local_addr); } @@ -1253,6 +1263,9 @@ class controller_base // -------------------------------------------------------------------- inline const locality& here() const { return here_; } + // -------------------------------------------------------------------- + inline const fi_addr_t& fi_address() const { return here_.fi_address(); } + // -------------------------------------------------------------------- inline void setHere(const locality& val) { here_ = val; } @@ -1297,19 +1310,73 @@ class controller_base addr.set_fi_address(fi_addr_t(i)); if ((ret == 0) && (addrlen == locality_defs::array_size)) { - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("address vector"), debug::dec<3>(i), iplocality(addr))); } else { - DEBUG(NS_DEBUG::cnb_err, - error(debug::str<>("address length"), debug::dec<3>(addrlen), debug::dec<3>(locality_defs::array_size))); + LF_DEB(NS_DEBUG::cnb_err, + error(debug::str<>("address length"), debug::dec<3>(addrlen), + debug::dec<3>(locality_defs::array_size))); throw std::runtime_error("debug_print_av_vector : address vector " "traversal failure"); } } } + // -------------------------------------------------------------------- + inline constexpr bool bypass_tx_lock() + { +#if defined(HAVE_LIBFABRIC_GNI) + return true; +#elif defined(HAVE_LIBFABRIC_CXI) + // @todo : cxi provider is not yet thread safe using scalable endpoints + return false; +#else + return (threadlevel_flags() == FI_THREAD_SAFE || + endpoint_type_ == endpoint_type::threadlocalTx); +#endif + } + + // -------------------------------------------------------------------- + inline controller_base::unique_lock get_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_); + } + + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_tx_lock() + { + if (bypass_tx_lock()) return unique_lock(); + return unique_lock(send_mutex_, std::try_to_lock_t{}); + } + + // -------------------------------------------------------------------- + inline constexpr bool bypass_rx_lock() + { +#ifdef HAVE_LIBFABRIC_GNI + return true; +#else + return ( + threadlevel_flags() == FI_THREAD_SAFE || endpoint_type_ == endpoint_type::scalableTxRx); +#endif + } + + // -------------------------------------------------------------------- + inline controller_base::unique_lock get_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return unique_lock(recv_mutex_); + } + + // -------------------------------------------------------------------- + inline controller_base::unique_lock try_rx_lock() + { + if (bypass_rx_lock()) return unique_lock(); + return unique_lock(recv_mutex_, std::try_to_lock_t{}); + } + // -------------------------------------------------------------------- progress_status poll_for_work_completions(void* user_data) { @@ -1317,11 +1384,13 @@ class controller_base bool retry = false; do { // sends - uint32_t nsend = static_cast(this)->poll_send_queue(get_tx_endpoint().get_tx_cq(), user_data); + uint32_t nsend = static_cast(this)->poll_send_queue( + get_tx_endpoint().get_tx_cq(), user_data); p.m_num_sends += nsend; retry = (nsend == max_completions_per_poll_); // recvs - uint32_t nrecv = static_cast(this)->poll_recv_queue(get_rx_endpoint().get_rx_cq(), user_data); + uint32_t nrecv = static_cast(this)->poll_recv_queue( + get_rx_endpoint().get_rx_cq(), user_data); p.m_num_recvs += nrecv; retry |= (nrecv == max_completions_per_poll_); } while (retry); @@ -1352,7 +1421,7 @@ class controller_base cq_attr.wait_cond = FI_CQ_COND_NONE; cq_attr.size = size; cq_attr.flags = 0 /*FI_COMPLETION*/; - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("CQ size"), debug::dec<4>(size))); // open completion queue on fabric domain and set context to null int ret = fi_cq_open(domain, &cq_attr, &cq, nullptr); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_cq_open"); @@ -1375,7 +1444,7 @@ class controller_base #ifdef RX_CONTEXTS_SUPPORT while (num_rx_contexts >> ++rx_ctx_bits) ; - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits)); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("rx_ctx_bits"), rx_ctx_bits)); #endif av_attr.rx_ctx_bits = rx_ctx_bits; // if contexts is nonzero, then we are using a single scalable endpoint @@ -1387,11 +1456,11 @@ class controller_base } else { - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("map FI_AV_TABLE"))); av_attr.type = FI_AV_TABLE; } - DEBUG(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV"))); + LF_DEB(NS_DEBUG::cnb_deb, debug(debug::str<>("Creating AV"))); int ret = fi_av_open(fabric_domain_, &av_attr, &av, nullptr); if (ret) throw NS_LIBFABRIC::fabric_error(ret, "fi_av_open"); return av; @@ -1405,7 +1474,7 @@ class controller_base { [[maybe_unused]] auto scp = NS_DEBUG::cnb_deb.scope(NS_DEBUG::ptr(this), __func__); - DEBUG(NS_DEBUG::cnb_deb, + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("inserting AV"), iplocality(address), NS_DEBUG::ptr(av))); fi_addr_t fi_addr = 0xffffffff; int ret = fi_av_insert(av, address.fabric_data(), 1, &fi_addr, 0, nullptr); @@ -1417,8 +1486,8 @@ class controller_base } // address was generated correctly, now update the locality with the fi_addr locality new_locality(address, fi_addr); - DEBUG(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr), - iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr))); + LF_DEB(NS_DEBUG::cnb_deb, trace(debug::str<>("AV add"), "rank", debug::dec<>(fi_addr), + iplocality(new_locality), "fi_addr", debug::hex<4>(fi_addr))); return new_locality; } }; diff --git a/src/libfabric/fabric_error.hpp b/src/libfabric/fabric_error.hpp index bc7183a4..89422e19 100644 --- a/src/libfabric/fabric_error.hpp +++ b/src/libfabric/fabric_error.hpp @@ -15,17 +15,15 @@ // #include // -#include "./print.hpp" +#include "oomph_libfabric_defines.hpp" -namespace oomph +namespace NS_DEBUG { // cppcheck-suppress ConfigurationNotChecked static NS_DEBUG::enable_print err_deb("ERROR__"); -} // namespace oomph +} // namespace NS_DEBUG -namespace oomph -{ -namespace libfabric +namespace NS_LIBFABRIC { class fabric_error : public std::runtime_error @@ -36,7 +34,7 @@ class fabric_error : public std::runtime_error : std::runtime_error(std::string(fi_strerror(-err)) + msg) , error_(err) { - err_deb.error(msg, ":", fi_strerror(-err)); + NS_DEBUG::err_deb.error(msg, ":", fi_strerror(-err)); std::terminate(); } @@ -44,12 +42,11 @@ class fabric_error : public std::runtime_error : std::runtime_error(fi_strerror(-err)) , error_(-err) { - err_deb.error(what()); + NS_DEBUG::err_deb.error(what()); std::terminate(); } int error_; }; -} // namespace libfabric -} // namespace oomph +} // namespace NS_LIBFABRIC diff --git a/src/libfabric/gni-debug.txt b/src/libfabric/gni-debug.txt index 53eb3018..7f61601c 100644 --- a/src/libfabric/gni-debug.txt +++ b/src/libfabric/gni-debug.txt @@ -14,3 +14,5 @@ mpiexec -n 2 --oversubscribe konsole -e gdb -ex run --args /home/biddisco/build/ # Build on LUMI # ----------------------------------------------- cmake -DCMAKE_BUILD_TYPE=Release -DOOMPH_WITH_LIBFABRIC=ON -DOOMPH_LIBFABRIC_PROVIDER=cxi -DLIBFABRIC_ROOT=/opt/cray/libfabric/1.15.0.0/ -DOOMPH_WITH_BENCHMARKS=ON -DOOMPH_BENCHMARKS_MT=ON -DOOMPH_WITH_TESTING=ON /users/jobiddis/src/ghex/extern/oomph/ + +MPICH_GNI_NDREG_ENTRIES=1024 MPICH_MAX_THREAD_SAFETY=single LIBFABRIC_POLL_SIZE=32 LIBFABRIC_ENDPOINT_TYPE=scalableTx srun --cpu-bind=cores --unbuffered --ntasks 2 --cpus-per-task 1 timeout 120 /scratch/snx3000/biddisco/build/oomph/benchmarks/bench_p2p_bi_ft_avail_libfabric 50000 1000000 10 > dump diff --git a/src/libfabric/libfabric_defines.hpp b/src/libfabric/libfabric_defines.hpp deleted file mode 100644 index 03328979..00000000 --- a/src/libfabric/libfabric_defines.hpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * ghex-org - * - * Copyright (c) 2014-2023, ETH Zurich - * All rights reserved. - * - * Please, refer to the LICENSE file in the root directory. - * SPDX-License-Identifier: BSD-3-Clause - */ -#pragma once - -// ------------------------------------------------------------------ -// This section exists to make interoperabily/sharing of code -// between OOMPH/GHEX and HPX easier -#if __has_include() -#include -#elif __has_include("oomph_libfabric_defines.hpp") -#include "oomph_libfabric_defines.hpp" -#endif diff --git a/src/libfabric/libfabric_defines_template.hpp b/src/libfabric/libfabric_defines_template.hpp new file mode 100644 index 00000000..64c04944 --- /dev/null +++ b/src/libfabric/libfabric_defines_template.hpp @@ -0,0 +1,39 @@ +#ifndef OOMPH_LIBFABRIC_CONFIG_LIBFABRIC_HPP +#define OOMPH_LIBFABRIC_CONFIG_LIBFABRIC_HPP + +// definitions that cmake generates from user options +// clang-format off +@oomph_config_defines@ +// clang-format on + +// ------------------------------------------------------------------ +// This section exists to make interoperabily/sharing of code +// between OOMPH/GHEX and HPX easier - there are some files that do +// the majority of libfabric initialization/setup and polling that +// are basically the same in many apps, these files can be reused provided +// some namespaces for the lib and for debugging are setup correctly + +#define NS_LIBFABRIC oomph::libfabric +#define NS_MEMORY oomph::libfabric +#define NS_DEBUG oomph::debug + +#ifndef LF_DEB +#define LF_DEB(printer, Expr) \ + if constexpr (printer.is_enabled()) { printer.Expr; }; +#endif + +#define LFSOURCE_DIR "@OOMPH_SRC_LIBFABRIC_DIR@" +#define LFPRINT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/print.hpp" +#define LFCOUNT_HPP "@OOMPH_SRC_LIBFABRIC_DIR@/simple_counter.hpp" + +// oomph has a debug print helper file in the main source tree +#if __has_include(LFPRINT_HPP) +#include LFPRINT_HPP +#define has_debug 1 +#endif + +#if __has_include(LFCOUNT_HPP) +#include LFCOUNT_HPP +#endif + +#endif diff --git a/src/libfabric/locality.hpp b/src/libfabric/locality.hpp index 2b65d61b..0bbbb16f 100644 --- a/src/libfabric/locality.hpp +++ b/src/libfabric/locality.hpp @@ -9,17 +9,17 @@ */ #pragma once -#include -#include -#include #include +#include +#include +#include +#include // #include #include #include // -#include "libfabric_defines.hpp" -#include "print.hpp" +#include "oomph_libfabric_defines.hpp" // Different providers use different address formats that we must accommodate // in our locality object. @@ -31,6 +31,10 @@ #define HAVE_LIBFABRIC_LOCALITY_SIZE 4 #endif +#ifdef HAVE_LIBFABRIC_EFA +#define HAVE_LIBFABRIC_LOCALITY_SIZE 32 +#endif + #if defined(HAVE_LIBFABRIC_VERBS) || defined(HAVE_LIBFABRIC_TCP) || \ defined(HAVE_LIBFABRIC_SOCKETS) || defined(HAVE_LIBFABRIC_PSM2) #define HAVE_LIBFABRIC_LOCALITY_SIZE 16 @@ -40,7 +44,7 @@ namespace oomph { // cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print loc_deb("LOCALIT"); +static NS_DEBUG::enable_print loc_deb("LOCALIT"); } // namespace oomph namespace oomph @@ -86,41 +90,41 @@ struct locality { std::memcpy(&data_[0], &in_data[0], locality_defs::array_size); fi_address_ = 0; - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("expl constructing"), iplocality((*this)))); } locality() { std::memset(&data_[0], 0x00, locality_defs::array_size); fi_address_ = 0; - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("default construct"), iplocality((*this)))); } locality(const locality& other) : data_(other.data_) , fi_address_(other.fi_address_) { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy construct"), iplocality((*this)))); } locality(const locality& other, fi_addr_t addr) : data_(other.data_) , fi_address_(addr) { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy fi construct"), iplocality((*this)))); } locality(locality&& other) : data_(std::move(other.data_)) , fi_address_(other.fi_address_) { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("move construct"), iplocality((*this)))); } // provided to support sockets mode bootstrap explicit locality(const std::string& address, const std::string& portnum) { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum)); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("explicit construct"), address, ":", portnum)); // struct sockaddr_in socket_data; memset(&socket_data, 0, sizeof(socket_data)); @@ -130,19 +134,19 @@ struct locality // std::memcpy(&data_[0], &socket_data, locality_defs::array_size); fi_address_ = 0; - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("string constructing"), iplocality((*this)))); } // some condition marking this locality as valid explicit inline operator bool() const { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("bool operator"), iplocality((*this)))); return (ip_address() != 0); } inline bool valid() const { - OOMPH_DP_ONLY(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this)))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("valid operator"), iplocality((*this)))); return (ip_address() != 0); } @@ -150,21 +154,21 @@ struct locality { data_ = other.data_; fi_address_ = other.fi_address_; - OOMPH_DP_ONLY(loc_deb, + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("copy operator"), iplocality(*this), iplocality(other))); return *this; } bool operator==(const locality& other) { - OOMPH_DP_ONLY(loc_deb, + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality operator"), iplocality(*this), iplocality(other))); return std::memcmp(&data_, &other.data_, locality_defs::array_size) == 0; } bool less_than(const locality& other) { - OOMPH_DP_ONLY(loc_deb, + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("less operator"), iplocality(*this), iplocality(other))); if (ip_address() < other.ip_address()) return true; if (ip_address() == other.ip_address()) return port() < other.port(); @@ -179,6 +183,8 @@ struct locality return data_[0]; #elif defined(HAVE_LIBFABRIC_CXI) return data_[0]; +#elif defined(HAVE_LIBFABRIC_EFA) + return data_[0]; #else throw fabric_error(0, "unsupported fabric provider, please fix ASAP"); #endif @@ -192,6 +198,8 @@ struct locality return data[0]; #elif defined(HAVE_LIBFABRIC_CXI) return data[0]; +#elif defined(HAVE_LIBFABRIC_EFA) + return data[0]; #else throw fabric_error(0, "unsupported fabric provider, please fix ASAP"); #endif @@ -215,7 +223,7 @@ struct locality private: friend bool operator==(locality const& lhs, locality const& rhs) { - OOMPH_DP_ONLY(loc_deb, + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("equality friend"), iplocality(lhs), iplocality(rhs))); return ((lhs.data_ == rhs.data_) && (lhs.fi_address_ == rhs.fi_address_)); } @@ -226,8 +234,7 @@ struct locality const uint32_t& a2 = rhs.ip_address(); const fi_addr_t& f1 = lhs.fi_address(); const fi_addr_t& f2 = rhs.fi_address(); - OOMPH_DP_ONLY(loc_deb, - trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs))); + LF_DEB(loc_deb, trace(NS_DEBUG::str<>("less friend"), iplocality(lhs), iplocality(rhs))); return (a1 < a2) || (a1 == a2 && f1 < f2); } diff --git a/src/libfabric/memory_region.hpp b/src/libfabric/memory_region.hpp index bb67a4ed..97cdfaa8 100644 --- a/src/libfabric/memory_region.hpp +++ b/src/libfabric/memory_region.hpp @@ -18,31 +18,40 @@ #include #include +#include "oomph_libfabric_defines.hpp" #include "fabric_error.hpp" -// ------------------------------------------------------------------ -// This section exists to make interoperabily/sharing of code -// between OOMPH/GHEX and HPX easier -#if __has_include("./print.hpp") -#include "print.hpp" -#define DEBUG OOMPH_DP_ONLY -#define has_debug 1 -#elif __has_include() -#include -#include -#define DEBUG(printer, Expr) HPX_DP_ONLY(printer, Expr) -#define has_debug 1 -#else -#define DEBUG(printer, Expr) +#ifdef OOMPH_ENABLE_DEVICE +#include #endif // ------------------------------------------------------------------ -#define NS_MEMORY oomph::libfabric - namespace NS_MEMORY { -static NS_DEBUG::enable_print mrn_deb("REGION_"); +static NS_DEBUG::enable_print mrn_deb("REGION_"); + +/* +struct fi_mr_attr { + const struct iovec *mr_iov; + size_t iov_count; + uint64_t access; + uint64_t offset; + uint64_t requested_key; + void *context; + size_t auth_key_size; + uint8_t *auth_key; + enum fi_hmem_iface iface; + union { + uint64_t reserved; + int cuda; + int ze; + int neuron; + int synapseai; + } device; + void *hmem_data; +}; +*/ // This is the only part of the code that actually // calls libfabric functions @@ -53,30 +62,56 @@ struct region_provider using provider_domain = struct fid_domain; // register region - template - static inline int register_memory(Args&&... args) + static inline int fi_register_memory(provider_domain* pd, int device_id, const void* buf, + size_t len, uint64_t access_flags, uint64_t offset, uint64_t request_key, struct fid_mr** mr) { - return fi_mr_reg(std::forward(args)...); - } - - // register region - template - static inline int register_memory_attr(Args&&... args) - { - [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope(__func__, std::forward(args)...); - // int x = FI_HMEM_ROCR; - // fi_mr_regattr(struct fid_domain *domain, const struct fi_mr_attr *attr, - // uint64_t flags, struct fid_mr **mr) - return fi_mr_regattr(std::forward(args)...); + [[maybe_unused]] auto scp = NS_MEMORY::mrn_deb.scope(__func__, NS_DEBUG::ptr(buf), NS_DEBUG::dec<>(len), device_id); + // + struct iovec addresses = {/*.iov_base = */const_cast(buf), /*.iov_len = */len}; + fi_mr_attr attr = { + /*.mr_iov = */&addresses, + /*.iov_count = */1, + /*.access = */access_flags, + /*.offset = */offset, + /*.requested_key = */request_key, + /*.context = */nullptr, + /*.auth_key_size = */0, + /*.auth_key = */nullptr, + /*.iface = */FI_HMEM_SYSTEM, + /*.device = */{0}, +#if (FI_MAJOR_VERSION == 1) && (FI_MINOR_VERSION < 17) + }; +#else + /*.hmem_data = */nullptr}; +#endif + if (device_id >= 0) + { +#ifdef OOMPH_ENABLE_DEVICE + attr.device.cuda = device_id; + int handle = hwmalloc::get_device_id(); + attr.device.cuda = handle; +#if defined(OOMPH_DEVICE_CUDA) + attr.iface = FI_HMEM_CUDA; + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("CUDA"), "set device id", device_id, handle)); +#elif defined(OOMPH_DEVICE_HIP) + attr.iface = FI_HMEM_ROCR; + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("HIP"), "set device id", device_id, handle)); +#endif +#endif + } + uint64_t flags = 0; + int ret = fi_mr_regattr(pd, &attr, flags, mr); + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "register_memory"); } + return ret; } // unregister region static inline int unregister_memory(provider_region* region) { return fi_close(®ion->fid); } // Default registration flags for this provider - static inline constexpr int flags() + static inline constexpr int access_flags() { - return FI_READ | FI_WRITE | FI_RECV | FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE; + return FI_READ | FI_WRITE | FI_RECV | FI_SEND /*| FI_REMOTE_READ | FI_REMOTE_WRITE*/; } // Get the local descriptor of the memory region. @@ -116,7 +151,7 @@ struct memory_handle , size_{uint32_t(size)} , used_space_{0} { - // DEBUG(NS_MEMORY::mrn_deb, + // LF_DEB(NS_MEMORY::mrn_deb, // trace(NS_DEBUG::str<>("memory_handle"), *this)); } @@ -181,16 +216,16 @@ struct memory_handle { if (region_ /*&& !get_user_region()*/) { - DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_)); + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("release"), region_)); // if (region_provider::unregister_memory(region_)) { - DEBUG(NS_MEMORY::mrn_deb, error("fi_close mr failed")); + LF_DEB(NS_MEMORY::mrn_deb, error("fi_close mr failed")); return -1; } else { - DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this)); + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("de-Registered region"), *this)); } region_ = nullptr; } @@ -201,16 +236,16 @@ struct memory_handle friend std::ostream& operator<<(std::ostream& os, memory_handle const& region) { (void)region; -#if has_debug - // clang-format off - os /*<< "region "*/ << NS_DEBUG::ptr(®ion) - //<< " fi_region " << NS_DEBUG::ptr(region.region_) - << " address " << NS_DEBUG::ptr(region.address_) - << " size " << NS_DEBUG::hex<6>(region.size_) - //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/) - << " loc key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr) - << " rem key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0); - // clang-format on +#if 1 || has_debug + os << "region " << NS_DEBUG::ptr(®ion) + //<< " fi_region " << NS_DEBUG::ptr(region.region_) + << " address " << NS_DEBUG::ptr(region.address_) + << " size " << NS_DEBUG::hex<6>(region.size_) + //<< " used_space " << NS_DEBUG::hex<6>(region.used_space_/*size_*/) + << " loc key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_local_key(region.region_) : nullptr) + << " rem key " << NS_DEBUG::ptr(region.region_ ? region_provider::get_remote_key(region.region_) : 0); + ///// clang-format off + ///// clang-format on #endif return os; } @@ -274,7 +309,8 @@ struct memory_segment : public memory_handle // we do not cache local/remote keys here because memory segments are only // used by the heap to store chunks and the user will always receive // a memory_handle - which does have keys cached - memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr, void *ep) + memory_segment(provider_domain* pd, const void* buffer, const uint64_t length, bool bind_mr, + void* ep, int device_id) { // an rma key counter to keep some providers (CXI) happy static std::atomic key = 0; @@ -285,22 +321,25 @@ struct memory_segment : public memory_handle region_ = nullptr; // base_addr_ = memory_handle::address_; - DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this)); + LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("memory_segment"), *this, device_id)); - int ret = region_provider::register_memory(pd, const_cast(buffer), length, - region_provider::flags(), 0, key++, 0, &(region_), nullptr); - if (ret) { throw libfabric::fabric_error(int(ret), "register_memory"); } - else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Registered region"), *this)); } + int ret = region_provider::fi_register_memory(pd, device_id, buffer, length, + region_provider::access_flags(), 0, key++, &(region_)); + if (!ret) + { + LF_DEB(NS_MEMORY::mrn_deb, + trace(NS_DEBUG::str<>("Registered region"), "device", device_id, *this)); + } if (bind_mr) { ret = fi_mr_bind(region_, (struct fid*)ep, 0); - if (ret) { throw libfabric::fabric_error(int(ret), "fi_mr_bind"); } - else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); } + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_bind"); } + else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Bound region"), *this)); } ret = fi_mr_enable(region_); - if (ret) { throw libfabric::fabric_error(int(ret), "fi_mr_enable"); } - else { DEBUG(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); } + if (ret) { throw NS_LIBFABRIC::fabric_error(int(ret), "fi_mr_enable"); } + else { LF_DEB(NS_MEMORY::mrn_deb, trace(NS_DEBUG::str<>("Enabled region"), *this)); } } } diff --git a/src/libfabric/operation_context.cpp b/src/libfabric/operation_context.cpp index 11bc85dc..92ffbf78 100644 --- a/src/libfabric/operation_context.cpp +++ b/src/libfabric/operation_context.cpp @@ -13,38 +13,36 @@ #include #include -namespace oomph -{ -namespace libfabric +namespace oomph::libfabric { void operation_context::handle_cancelled() { - [[maybe_unused]] auto scp = ctx_deb.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = opctx_deb.scope(NS_DEBUG::ptr(this), __func__); // enqueue the cancelled/callback - if (m_req.index() == 0) + if (std::holds_alternative(m_req)) { // regular (non-shared) recv - auto s = std::get<0>(m_req); + auto s = std::get(m_req); while (!(s->m_comm->m_recv_cb_cancel.push(s))) {} } - else + else if (std::holds_alternative(m_req)) { // shared recv - auto s = std::get<1>(m_req); - + auto s = std::get(m_req); while (!(s->m_ctxt->m_recv_cb_cancel.push(s))) {} } + else { throw std::runtime_error("Request state invalid in handle_cancelled"); } } int operation_context::handle_tagged_recv_completion_impl(void* user_data) { - [[maybe_unused]] auto scp = ctx_deb.scope(NS_DEBUG::ptr(this), __func__); - if (m_req.index() == 0) + [[maybe_unused]] auto scp = opctx_deb.scope(NS_DEBUG::ptr(this), __func__); + if (std::holds_alternative(m_req)) { // regular (non-shared) recv - auto s = std::get<0>(m_req); + auto s = std::get(m_req); //if (std::this_thread::get_id() == thread_id_) if (reinterpret_cast(user_data) == s->m_comm) { @@ -66,10 +64,10 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data) while (!(s->m_comm->m_recv_cb_queue.push(s))) {} } } - else + else if (std::holds_alternative(m_req)) { // shared recv - auto s = std::get<1>(m_req); + auto s = std::get(m_req); if (!s->m_comm->m_context->has_reached_recursion_depth()) { auto inc = s->m_comm->m_context->recursion(); @@ -82,20 +80,36 @@ operation_context::handle_tagged_recv_completion_impl(void* user_data) while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} } } + else + { + detail::request_state** req = reinterpret_cast(&m_req); + LF_DEB(NS_MEMORY::opctx_deb, + error(NS_DEBUG::str<>("invalid request_state"), this, "request", NS_DEBUG::ptr(req))); + throw std::runtime_error("Request state invalid in handle_tagged_recv"); + } return 1; } int operation_context::handle_tagged_send_completion_impl(void* user_data) { - auto s = std::get<0>(m_req); - if (reinterpret_cast(user_data) == s->m_comm) + if (std::holds_alternative(m_req)) { - if (!s->m_comm->has_reached_recursion_depth()) + // regular (non-shared) recv + auto s = std::get(m_req); + if (reinterpret_cast(user_data) == s->m_comm) { - auto inc = s->m_comm->recursion(); - auto ptr = s->release_self_ref(); - s->invoke_cb(); + if (!s->m_comm->has_reached_recursion_depth()) + { + auto inc = s->m_comm->recursion(); + auto ptr = s->release_self_ref(); + s->invoke_cb(); + } + else + { + // enqueue the callback + while (!(s->m_comm->m_send_cb_queue.push(s))) {} + } } else { @@ -103,12 +117,23 @@ operation_context::handle_tagged_send_completion_impl(void* user_data) while (!(s->m_comm->m_send_cb_queue.push(s))) {} } } - else + else if (std::holds_alternative(m_req)) { - // enqueue the callback - while (!(s->m_comm->m_send_cb_queue.push(s))) {} + // shared recv + auto s = std::get(m_req); + if (!s->m_comm->m_context->has_reached_recursion_depth()) + { + auto inc = s->m_comm->m_context->recursion(); + auto ptr = s->release_self_ref(); + s->invoke_cb(); + } + else + { + // enqueue the callback + while (!(s->m_comm->m_context->m_recv_cb_queue.push(s))) {} + } } + else { throw std::runtime_error("Request state invalid in handle_tagged_send"); } return 1; } -} // namespace libfabric -} // namespace oomph +} // namespace oomph::libfabric diff --git a/src/libfabric/operation_context.hpp b/src/libfabric/operation_context.hpp index 4c0114a4..c99533db 100644 --- a/src/libfabric/operation_context.hpp +++ b/src/libfabric/operation_context.hpp @@ -13,16 +13,13 @@ // #include // -#include -#include +#include "operation_context_base.hpp" // -namespace oomph -{ -namespace libfabric +namespace oomph::libfabric { // cppcheck-suppress ConfigurationNotChecked -static NS_DEBUG::enable_print ctx_deb("CONTEXT"); +static NS_DEBUG::enable_print opctx_deb("OP__CXT"); // This struct holds the ready state of a future // we must also store the context used in libfabric, in case @@ -37,7 +34,7 @@ struct operation_context : public operation_context_base , m_req{req} { [[maybe_unused]] auto scp = - ctx_deb.scope(NS_DEBUG::ptr(this), __func__, NS_DEBUG::ptr(req)); + opctx_deb.scope(NS_DEBUG::ptr(this), __func__, "request", req); } // -------------------------------------------------------------------- @@ -53,5 +50,4 @@ struct operation_context : public operation_context_base int handle_tagged_send_completion_impl(void* user_data); }; -} // namespace libfabric -} // namespace oomph +} // namespace oomph::libfabric diff --git a/src/libfabric/operation_context_base.hpp b/src/libfabric/operation_context_base.hpp index 058840ea..9625bae7 100644 --- a/src/libfabric/operation_context_base.hpp +++ b/src/libfabric/operation_context_base.hpp @@ -10,15 +10,15 @@ #pragma once #include -#include "libfabric_defines.hpp" - -#define NS_LIBFABRIC oomph::libfabric +#include "oomph_libfabric_defines.hpp" namespace NS_LIBFABRIC { class controller; +static NS_DEBUG::enable_print ctx_bas("CTXBASE"); + enum operation_context_type : int32_t { ctx_unknown = 0, @@ -45,6 +45,7 @@ struct operation_context_base : context_reserved_space() , type_{ctype} { + [[maybe_unused]] auto scp = ctx_bas.scope(NS_DEBUG::ptr(this), __func__); } // type is needed to smiplify the dispatch of errors @@ -72,21 +73,14 @@ struct operation_context_base { return static_cast(this)->handle_tagged_send_completion_impl(user_data); } - int handle_tagged_send_completion_impl() { return 0; } + int handle_tagged_send_completion_impl(void* /*user_data*/) { return 0; } // recv - int handle_recv_completion(std::uint64_t len, bool threadlocal) - { - return static_cast(this)->handle_recv_completion_impl(len, threadlocal); - } - int handle_recv_completion_impl(std::uint64_t /*len*/, bool /*threadlocal*/) { return 0; } - - // recv + with source adddress (used with FI_SOURCE) - int handle_recv_src_completion(fi_addr_t const src_addr, std::uint64_t len) + int handle_recv_completion(std::uint64_t len) { - return static_cast(this)->handle_recv_src_completion_impl(src_addr, len); + return static_cast(this)->handle_recv_completion_impl(len); } - int handle_recv_src_completion_impl(fi_addr_t const src_addr, std::uint64_t len) { return 0; } + int handle_recv_completion_impl(std::uint64_t /*len*/) { return 0; } // tagged recv int handle_tagged_recv_completion(void* user_data) diff --git a/src/libfabric/print.hpp b/src/libfabric/print.hpp index 37265bc3..7036d759 100644 --- a/src/libfabric/print.hpp +++ b/src/libfabric/print.hpp @@ -72,18 +72,9 @@ extern char** environ; // The output will only be produced every N seconds // ------------------------------------------------------------ -// Used to wrap function call parameters to prevent evaluation -// when debugging is disabled -#define OOMPH_DP_LAZY(printer, Expr) printer.eval([&] { return Expr; }) -#if (__cplusplus >= 201703L) -#define OOMPH_DP_ONLY(printer, Expr) \ - if constexpr (printer.is_enabled()) { printer.Expr; }; -#else -#define OOMPH_DP_ONLY(printer, Expr) \ - if (printer.is_enabled()) { printer.Expr; }; -#endif - #define NS_DEBUG oomph::debug +#define LF_DEB(printer, Expr) \ + if constexpr (printer.is_enabled()) { printer.Expr; }; // ------------------------------------------------------------ /// \cond NODETAIL @@ -721,5 +712,5 @@ struct enable_print } }; -} // namespace oomph::debug +} // namespace NS_DEBUG /// \endcond diff --git a/src/libfabric/request_state.hpp b/src/libfabric/request_state.hpp index 03bca9d5..25f7ea60 100644 --- a/src/libfabric/request_state.hpp +++ b/src/libfabric/request_state.hpp @@ -24,7 +24,7 @@ struct request_state { using base = request_state_base; using shared_ptr_t = util::unsafe_shared_ptr; - using operation_context = oomph::libfabric::operation_context; + using operation_context = libfabric::operation_context; operation_context m_operation_context; util::unsafe_shared_ptr m_self_ptr; @@ -34,12 +34,7 @@ struct request_state : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} , m_operation_context{this} { - [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__); - } - - ~request_state() - { - [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__); + //[[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__); } void progress(); @@ -67,7 +62,7 @@ struct shared_request_state { using base = request_state_base; using shared_ptr_t = std::shared_ptr; - using operation_context = oomph::libfabric::operation_context; + using operation_context = libfabric::operation_context; operation_context m_operation_context; std::shared_ptr m_self_ptr; @@ -77,12 +72,12 @@ struct shared_request_state : base{ctxt, comm, scheduled, rank, tag, std::move(cb)} , m_operation_context{this} { - [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__); } ~shared_request_state() { - [[maybe_unused]] auto scp = oomph::libfabric::ctx_deb.scope(NS_DEBUG::ptr(this), __func__); + [[maybe_unused]] auto scp = libfabric::opctx_deb.scope(NS_DEBUG::ptr(this), __func__); } void progress(); diff --git a/src/libfabric/simple_counter.hpp b/src/libfabric/simple_counter.hpp index 44b79ef7..f44eac92 100644 --- a/src/libfabric/simple_counter.hpp +++ b/src/libfabric/simple_counter.hpp @@ -9,7 +9,7 @@ */ #pragma once -#include "libfabric_defines.hpp" +#include "oomph_libfabric_defines.hpp" // #include #include diff --git a/src/mpi/context.hpp b/src/mpi/context.hpp index bead5917..2d221280 100644 --- a/src/mpi/context.hpp +++ b/src/mpi/context.hpp @@ -95,7 +95,7 @@ register_memory(context_impl& c, void* ptr, std::size_t) #if OOMPH_ENABLE_DEVICE template<> inline region -register_device_memory(context_impl& c, void* ptr, std::size_t) +register_device_memory(context_impl& c, int, void* ptr, std::size_t) { return c.make_region(ptr); } diff --git a/src/mpi/rma_context.hpp b/src/mpi/rma_context.hpp index e3b04b1a..aec295f0 100644 --- a/src/mpi/rma_context.hpp +++ b/src/mpi/rma_context.hpp @@ -75,7 +75,7 @@ register_memory(rma_context& c, void* ptr, std::size_t size) #if OOMPH_ENABLE_DEVICE template<> inline rma_region -register_device_memory(rma_context& c, void* ptr, std::size_t size) +register_device_memory(rma_context& c, int, void* ptr, std::size_t size) { return c.make_region(ptr, size); } diff --git a/src/ucx/context.hpp b/src/ucx/context.hpp index 728a4506..760d24e7 100644 --- a/src/ucx/context.hpp +++ b/src/ucx/context.hpp @@ -251,7 +251,7 @@ register_memory(context_impl& c, void* ptr, std::size_t) #if OOMPH_ENABLE_DEVICE template<> inline region -register_device_memory(context_impl& c, void* ptr, std::size_t) +register_device_memory(context_impl& c, int, void* ptr, std::size_t) { return c.make_region(ptr); } diff --git a/src/ucx/rma_context.hpp b/src/ucx/rma_context.hpp index 25ff69d6..6f506a1a 100644 --- a/src/ucx/rma_context.hpp +++ b/src/ucx/rma_context.hpp @@ -60,9 +60,9 @@ register_memory(rma_context& c, void* ptr, std::size_t size) #if OOMPH_ENABLE_DEVICE template<> inline rma_region -register_device_memory(rma_context& c, void* ptr, std::size_t size) +register_device_memory(rma_context& c, int, void* ptr, std::size_t size) { - return c.make_region(ptr, size, true); + return c.make_region(ptr, size); } #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5217bbaf..92804a6e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,11 @@ add_subdirectory(mpi_runner) set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Daint)") +#------------------------------------------------------------------------------ +# Find Threads +#------------------------------------------------------------------------------ +find_package(Threads REQUIRED) + # --------------------------------------------------------------------- # compile tests # --------------------------------------------------------------------- @@ -10,7 +15,7 @@ set(OOMPH_TEST_LEAK_GPU_MEMORY OFF CACHE BOOL "Do not free memory (bug on Piz Da set(serial_tests test_unique_function test_unsafe_shared_ptr) # list of parallel tests to be executed -set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality) +set(parallel_tests test_context test_send_recv test_send_multi test_cancel test_locality test_mem_reg) #test_tag_range) if (OOMPH_ENABLE_BARRIER) list(APPEND parallel_tests test_barrier) @@ -24,7 +29,7 @@ function(compile_test t_) if (OOMPH_TEST_LEAK_GPU_MEMORY) target_compile_definitions(${t} PRIVATE OOMPH_TEST_LEAK_GPU_MEMORY) endif() - target_link_libraries(${t} PRIVATE ext-gtest) + target_link_libraries(${t} PRIVATE ${GTEST_LIBRARIES}) target_link_libraries(${t} PUBLIC oomph) endfunction() @@ -43,7 +48,7 @@ endforeach() function(reg_serial_test t) add_executable(${t} $) oomph_target_compile_options(${t}) - target_link_libraries(${t} PRIVATE ext-gtest) + target_link_libraries(${t} PRIVATE GTest::gtest) target_link_libraries(${t} PRIVATE oomph_common) add_test( NAME ${t} @@ -59,7 +64,7 @@ function(reg_parallel_test t_ lib n) set(t ${t_}_${lib}) add_executable(${t} $) oomph_target_compile_options(${t}) - target_link_libraries(${t} PRIVATE gtest_main_mpi) + target_link_libraries(${t} PRIVATE gtest_main_mpi Threads::Threads) target_link_libraries(${t} PRIVATE oomph_${lib}) add_test( NAME ${t} @@ -70,19 +75,19 @@ endfunction() if (OOMPH_WITH_MPI) foreach(t ${parallel_tests}) - reg_parallel_test(${t} mpi 4) + reg_parallel_test(${t} mpi 2) endforeach() endif() if (OOMPH_WITH_UCX) foreach(t ${parallel_tests}) - reg_parallel_test(${t} ucx 4) + reg_parallel_test(${t} ucx 2) endforeach() endif() if (OOMPH_WITH_LIBFABRIC) foreach(t ${parallel_tests}) - reg_parallel_test(${t} libfabric 4) + reg_parallel_test(${t} libfabric 2) endforeach() endif() diff --git a/test/mpi_runner/CMakeLists.txt b/test/mpi_runner/CMakeLists.txt index 47207747..408f9d1a 100644 --- a/test/mpi_runner/CMakeLists.txt +++ b/test/mpi_runner/CMakeLists.txt @@ -1,4 +1,2 @@ -add_library(gtest_main_mpi ./gtest_main_mpi.cpp) -target_link_libraries(gtest_main_mpi PRIVATE ext-gtest) -target_link_libraries(gtest_main_mpi PRIVATE MPI::MPI_CXX) - +add_library(gtest_main_mpi OBJECT ./gtest_main_mpi.cpp) +target_link_libraries(gtest_main_mpi PRIVATE GTest::gtest MPI::MPI_CXX) diff --git a/test/test_mem_reg.cpp b/test/test_mem_reg.cpp new file mode 100644 index 00000000..d61a9bcc --- /dev/null +++ b/test/test_mem_reg.cpp @@ -0,0 +1,258 @@ +/* + * ghex-org + * + * Copyright (c) 2014-2023, ETH Zurich + * All rights reserved. + * + * Please, refer to the LICENSE file in the root directory. + * SPDX-License-Identifier: BSD-3-Clause + */ +#include +#include +#include "./mpi_runner/mpi_test_fixture.hpp" +#include +#include +#include +#include + +#define NITERS 2 +#define SIZE 64 +#define NTHREADS 1 + +std::vector> shared_received(NTHREADS); +thread_local int thread_id; + +void +reset_counters() +{ + for (auto& x : shared_received) x.store(0); +} + +struct test_environment_base +{ + using rank_type = oomph::rank_type; + using tag_type = oomph::tag_type; + using message = oomph::message_buffer; + + oomph::context& ctxt; + oomph::communicator comm; + rank_type speer_rank; + rank_type rpeer_rank; + int thread_id; + int num_threads; + tag_type tag; + + test_environment_base(oomph::context& c, int tid, int num_t) + : ctxt(c) + , comm(ctxt.get_communicator()) + , speer_rank((comm.rank() + 1) % comm.size()) + , rpeer_rank((comm.rank() + comm.size() - 1) % comm.size()) + , thread_id(tid) + , num_threads(num_t) + , tag(tid) + { + } +}; + +struct test_environment : public test_environment_base +{ + using base = test_environment_base; + + static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc, + rank_type* ptr) + { + if (user_alloc) return comm.make_buffer(ptr, size); + else + return comm.make_buffer(size); + } + + std::vector raw_smsg; + std::vector raw_rmsg; + message smsg; + message rmsg; + + test_environment(oomph::context& c, std::size_t size, int tid, int num_t, bool user_alloc) + : base(c, tid, num_t) + , raw_smsg(user_alloc ? size : 0) + , raw_rmsg(user_alloc ? size : 0) + , smsg(make_buffer(comm, size, user_alloc, raw_smsg.data())) + , rmsg(make_buffer(comm, size, user_alloc, raw_rmsg.data())) + { + fill_send_buffer(); + fill_recv_buffer(); + } + + void fill_send_buffer() + { + for (auto& x : smsg) x = comm.rank(); + } + + void fill_recv_buffer() + { + for (auto& x : rmsg) x = -1; + } + + bool check_recv_buffer() + { + for (auto const& x : rmsg) + if (x != rpeer_rank) return false; + return true; + } +}; + +#if HWMALLOC_ENABLE_DEVICE +struct test_environment_device : public test_environment_base +{ + using base = test_environment_base; + + static auto make_buffer(oomph::communicator& comm, std::size_t size, bool user_alloc, + rank_type* device_ptr) + { + if (user_alloc) return comm.make_device_buffer(device_ptr, size, 0); + else + return comm.make_device_buffer(size, 0); + } + + struct device_allocation + { + void* m_ptr = nullptr; + device_allocation(std::size_t size = 0) + { + if (size) m_ptr = hwmalloc::device_malloc(size * sizeof(rank_type)); + } + device_allocation(device_allocation&& other) + : m_ptr{std::exchange(other.m_ptr, nullptr)} + { + } + ~device_allocation() + { +#ifndef OOMPH_TEST_LEAK_GPU_MEMORY + if (m_ptr) hwmalloc::device_free(m_ptr); +#endif + } + rank_type* get() const noexcept { return (rank_type*)m_ptr; } + }; + + device_allocation raw_device_smsg; + device_allocation raw_device_rmsg; + message smsg; + message rmsg; + + test_environment_device(oomph::context& c, std::size_t size, int tid, int num_t, + bool user_alloc) + : base(c, tid, num_t) +#ifndef OOMPH_TEST_LEAK_GPU_MEMORY + , raw_device_smsg(user_alloc ? size : 0) + , raw_device_rmsg(user_alloc ? size : 0) + , smsg(make_buffer(comm, size, user_alloc, raw_device_smsg.get())) + , rmsg(make_buffer(comm, size, user_alloc, raw_device_rmsg.get())) +#else + , raw_device_smsg(size) + , raw_device_rmsg(size) + , smsg(make_buffer(comm, size, true, raw_device_smsg.get())) + , rmsg(make_buffer(comm, size, true, raw_device_rmsg.get())) +#endif + { + fill_send_buffer(); + fill_recv_buffer(); + } + + void fill_send_buffer() + { + for (auto& x : smsg) x = comm.rank(); + smsg.clone_to_device(); + } + + void fill_recv_buffer() + { + for (auto& x : rmsg) x = -1; + rmsg.clone_to_device(); + } + + bool check_recv_buffer() + { + rmsg.clone_to_host(); + for (auto const& x : rmsg) + if (x != rpeer_rank) return false; + return true; + } +}; +#endif + +template +void +launch_test(Func f) +{ + // single threaded + { + oomph::context ctxt(MPI_COMM_WORLD, false); + reset_counters(); + f(ctxt, SIZE, 0, 1, false); + reset_counters(); + f(ctxt, SIZE, 0, 1, true); + } + + // multi threaded + { + oomph::context ctxt(MPI_COMM_WORLD, true); + std::vector threads; + threads.reserve(NTHREADS); + reset_counters(); + for (int i = 0; i < NTHREADS; ++i) + threads.push_back(std::thread{f, std::ref(ctxt), SIZE, i, NTHREADS, false}); + for (auto& t : threads) t.join(); + threads.clear(); + reset_counters(); + for (int i = 0; i < NTHREADS; ++i) + threads.push_back(std::thread{f, std::ref(ctxt), SIZE, i, NTHREADS, true}); + for (auto& t : threads) t.join(); + } +} + +// no callback +// =========== +template +void +test_send_recv(oomph::context& ctxt, std::size_t size, int tid, int num_threads, bool user_alloc) +{ + Env env(ctxt, size, tid, num_threads, user_alloc); + + // use is_ready() -> must manually progress the communicator + for (int i = 0; i < NITERS; i++) + { + auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag); + auto sreq = env.comm.send(env.smsg, env.speer_rank, env.tag); + while (!(rreq.is_ready() && sreq.is_ready())) { env.comm.progress(); }; + EXPECT_TRUE(env.check_recv_buffer()); + env.fill_recv_buffer(); + } + + // use test() -> communicator is progressed automatically + for (int i = 0; i < NITERS; i++) + { + auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag); + auto sreq = env.comm.send(env.smsg, env.speer_rank, env.tag); + while (!(rreq.test() && sreq.test())) {}; + EXPECT_TRUE(env.check_recv_buffer()); + env.fill_recv_buffer(); + } + + // use wait() -> communicator is progressed automatically + for (int i = 0; i < NITERS; i++) + { + auto rreq = env.comm.recv(env.rmsg, env.rpeer_rank, env.tag); + env.comm.send(env.smsg, env.speer_rank, env.tag).wait(); + rreq.wait(); + EXPECT_TRUE(env.check_recv_buffer()); + env.fill_recv_buffer(); + } +} + +TEST_F(mpi_test_fixture, send_recv) +{ + launch_test(test_send_recv); + std::cout << "\n\n\n\n\n\n\n\n" << std::endl; +#if HWMALLOC_ENABLE_DEVICE + launch_test(test_send_recv); +#endif +} diff --git a/test/test_unique_function.cpp b/test/test_unique_function.cpp index 8e18b86e..9d38d79b 100644 --- a/test/test_unique_function.cpp +++ b/test/test_unique_function.cpp @@ -53,11 +53,9 @@ TEST(unqiue_function, simple_function) EXPECT_EQ(3, uf1(4)); } - void test_stats(ctor_stats_data const& stats, int n_ctor, int n_dtor, int n_dtor_of_moved, int n_move_ctor, int n_calls); - // small function which fits within the stack buffer struct small_function { @@ -267,3 +265,10 @@ test_stats(ctor_stats_data const& stats, int n_ctor, int n_dtor, int n_dtor_of_m //std::cout << stats << std::endl; } + +int +main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/test_unsafe_shared_ptr.cpp b/test/test_unsafe_shared_ptr.cpp index a592f6b1..2668aa14 100644 --- a/test/test_unsafe_shared_ptr.cpp +++ b/test/test_unsafe_shared_ptr.cpp @@ -155,3 +155,10 @@ TEST(unsafe_shared_ptr, move_assign) EXPECT_EQ(d.alloc_ref_count, 0); } } + +int +main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}