diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..71b4e67e5e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.csv filter=lfs diff=lfs merge=lfs -text +inputs/wmd/data.001.csv filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 0000000000..925406e22c --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,162 @@ +name: Docker / Ubuntu 22.04 / Build and Test +run-name: docker-ubuntu-2204 performed by @${{ github.triggering_actor }} + +on: + pull_request: + branches: + - master + - main + push: + branches: + - master + - main + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + docker-create-ubuntu-2204: + name: create + runs-on: self-hosted + permissions: + contents: read + packages: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Create Docker Image + timeout-minutes: 45 + run: | + make ci-image + + ubuntu-2204-docker: + name: gcc / ${{ matrix.build-type }} / ${{ matrix.sanitizer-type }} + runs-on: self-hosted + permissions: + contents: read + packages: write + env: + IMAGE_NAME: galois + CONTAINER_SRC_DIR: "/pando-galois" + CONTAINER_BUILD_DIR: "/pando-galois/build" + CONTAINER_WORK_DIR: "/pando-galois" + GALOIS_CONTAINER_ENV: "-e=GALOIS_BUILD_TOOL=Ninja" + INTERACTIVE: "" + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + build-type: ['Release'] + sanitizer-type: ['nosan', 'san'] + needs: docker-create-ubuntu-2204 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: 'true' + submodules: recursive + + - name: Set up environment variables + timeout-minutes: 1 + run: | + echo "UNAME=$(whoami)" >> $GITHUB_ENV + echo "UID=$(id -u)" >> $GITHUB_ENV + echo "GID=$(id -g)" >> $GITHUB_ENV + echo "SRC_DIR=$(pwd)" >> $GITHUB_ENV + echo "GALOIS_CCACHE_DIR=/var/local/$(whoami)/.ccache" >> $GITHUB_ENV + echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile)" >> $GITHUB_ENV + if [ ${{ matrix.sanitizer-type }} == 'san' ]; then + echo "GALOIS_CONTAINER_ENV=$GALOIS_CONTAINER_ENV -e=GALOIS_EXTRA_CXX_FLAGS='\"-fsanitize=address -fsanitize=undefined\"'" >> $GITHUB_ENV + fi + if [ ${{ matrix.build-type }} == 'Debug' ]; then + echo "GALOIS_CONTAINER_ENV=$GALOIS_CONTAINER_ENV -e=GALOIS_EXTRA_CXX_FLAGS='-O3'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'zerberus-0' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'zerberus-1' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-0' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=0,1,2,3,4,5,6,7,32,33,34,35,36,37,38,39'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-1' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=16,17,18,19,20,21,22,23,48,49,50,51,52,53,54,55'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-2' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=8,9,10,11,12,13,14,15,40,41,42,43,44,45,46,47'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-3' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=24,25,26,27,28,29,30,31,56,57,58,59,60,61,62,63'" >> $GITHUB_ENV + fi + cat $GITHUB_ENV + + - name: Configure + timeout-minutes: 10 + run: | + 
mkdir -p ${{ env.GALOIS_CCACHE_DIR }} -m=777 + CONTAINER_CMD="bash -lc 'source /opt/intel/oneapi/setvars.sh && make setup-ci'" \ + CONTAINER_OPTS="-e=BUILD_TYPE=${{ matrix.build-type }}" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker + + - name: Build + timeout-minutes: 15 + run: | + CONTAINER_CMD="bash -c 'ninja -j10 || ninja || ninja'" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + CONTAINER_WORKDIR="${{ env.CONTAINER_BUILD_DIR }}" \ + make docker + + - name: Run Tests + timeout-minutes: 5 + run: | + CONTAINER_CMD="make run-tests" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker + + docker-pre-commit-ubuntu-2204: + name: pre-commit + runs-on: self-hosted + permissions: + contents: read + packages: write + env: + IMAGE_NAME: galois + CONTAINER_SRC_DIR: "/pando-galois" + CONTAINER_BUILD_DIR: "/pando-galois/build" + CONTAINER_WORK_DIR: "/pando-galois" + INTERACTIVE: "" + defaults: + run: + shell: bash -l {0} + needs: docker-create-ubuntu-2204 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up environment variables + timeout-minutes: 1 + run: | + echo "SRC_DIR=$(pwd)" >> $GITHUB_ENV + echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile)" >> $GITHUB_ENV + cat $GITHUB_ENV + - name: Check pre-commit + timeout-minutes: 10 + run: | + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker-pre-commit diff --git a/.gitignore b/.gitignore index 94fc673c6e..a1238adb3e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,12 +21,15 @@ cscope.out .tags* tags .ycm_extra_conf.py +.ccache # no build files -/build* +/*build* +/dockerbuild* # no python build artifacts *.pyc /python/galois.egg-info /python/galois/*.so /_skbuild + diff --git a/.gitmodules b/.gitmodules index 0095886558..d66cce84ad 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "moderngpu"] - path = external/moderngpu - url = https://github.com/moderngpu/moderngpu.git -[submodule "cub"] - path = external/cub - url = https://github.com/NVlabs/cub.git +[submodule "external/pcg-cpp"] + path = external/pcg-cpp + url = https://github.com/imneme/pcg-cpp.git +[submodule "external/parallel-hashmap"] + path = external/parallel-hashmap + url = https://github.com/greg7mdp/parallel-hashmap.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..c30b4276e2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +--- +files: ^libcusp|^libdeepgalois|^libdist|^libgalois|^libgluon|^libgnn|^libwmd +exclude: ^scripts|^python|^inputs +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.2.0 + hooks: + - id: forbid-tabs + exclude: ^scripts|^python + - id: remove-tabs + exclude: ^scripts|^python + args: [--whitespaces-count, '2'] + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-format + args: [-i] + # - id: clang-tidy + # args: [--fix, -p=build/compile_commands.json] diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000000..c00efa2d48 --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +pre-commit 2.19.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index b278ea0df3..721a4db6e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ 
-cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.17) project(Galois) @@ -6,6 +6,13 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) +if(STACK_CAPTURE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}") +endif(STACK_CAPTURE) + file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" GALOIS_VERSION_MAJOR ${GALOIS_VERSION}) @@ -22,9 +29,10 @@ endif() ###### Options (alternatively pass as options to cmake -DName=Value) ###### ###### Distributed-heterogeneous features ###### -set(GALOIS_ENABLE_DIST OFF CACHE BOOL "Enable distributed features") +set(GALOIS_ENABLE_DIST ON CACHE BOOL "Enable distributed features") set(GALOIS_CUDA_CAPABILITY "" CACHE STRING "Semi-colon list of CUDA compute capability version numbers to enable GPU features") # e.g., "3.7;6.1" set(GALOIS_COMM_STATS OFF CACHE BOOL "Report more detailed statistics of communication") +set(GALOIS_ENABLE_WMD ON CACHE BOOL "Enable WMD dataset support") ###### General features ###### set(GALOIS_ENABLE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(GALOIS_ENABLE_VTUNE OFF CACHE BOOL "Use VTune for profiling") @@ -33,7 +41,12 @@ set(GALOIS_GRAPH_LOCATION "" CACHE PATH "Location of inputs for tests if downloa set(CXX_CLANG_TIDY "" CACHE STRING "Semi-colon list specifying clang-tidy command and arguments") set(CMAKE_CXX_COMPILER_LAUNCHER "" CACHE STRING "Semi-colon list specifying command to wrap compiler invocations (e.g., ccache)") set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture ('none' to disable)") -set(GALOIS_USE_SANITIZER "" CACHE STRING "Semi-colon list of sanitizers to use (Memory, MemoryWithOrigins, Address, Undefined, Thread)") + +set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") +set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") +# TODO; this is GNN related; find better way to do than hardcode +#SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) + # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. # Putting this here is mostly just a placeholder so people know it's an option. 
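The `STACK_CAPTURE` block above works through GCC's `-finstrument-functions`, which makes the compiler call two well-known hooks on entry to and exit from every function that is not on the exclude list; `libgalois/include/galois/runtime/StackTracer.h` presumably supplies the real implementations. A minimal sketch of such hooks, assuming only the documented GCC interface (the depth counter and logging here are illustrative, not the repository's tracer):

```cpp
// Illustration only: with -finstrument-functions, the compiler emits calls to
// these hooks around every non-excluded function.
#include <cstdio>

namespace {
thread_local int g_depth = 0; // hypothetical per-thread call depth
}

extern "C" {

__attribute__((no_instrument_function))
void __cyg_profile_func_enter(void* fn, void* call_site) {
  // indent by call depth, then report the callee and caller addresses
  std::fprintf(stderr, "%*senter %p (from %p)\n", 2 * g_depth++, "", fn, call_site);
}

__attribute__((no_instrument_function))
void __cyg_profile_func_exit(void* fn, void* call_site) {
  std::fprintf(stderr, "%*sexit  %p (from %p)\n", 2 * --g_depth, "", fn, call_site);
}

} // extern "C"
```

The `-finstrument-functions-exclude-file-list` entries added above keep `StackTracer.h` and `<sstream>` themselves uninstrumented, presumably so the hooks do not recurse into their own logging code.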
@@ -45,6 +58,7 @@ set(GALOIS_NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a sing set(GALOIS_USE_LCI OFF CACHE BOOL "Use LCI network runtime instead of MPI") set(GALOIS_USE_BARE_MPI OFF CACHE BOOL "Use MPI directly (no dedicated network-runtime thread)") set(GALOIS_NUM_TEST_THREADS "" CACHE STRING "Maximum number of threads to use when running tests (default: number of physical cores)") +set(GALOIS_ENABLE_INSTRUMENT OFF CACHE BOOL "Enable generating instrument in the runtime") if(NOT GALOIS_NUM_TEST_THREADS) cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES) @@ -59,6 +73,31 @@ include(CTest) ###### Configure compiler ###### +if(PROJECT_IS_TOP_LEVEL) + include_directories(${Galois_SOURCE_DIR}/external/parallel-hashmap) + + if(CMAKE_CXX_FLAGS) + message(STATUS "Provided CXX Flags: " ${CMAKE_CXX_FLAGS}) + endif() + + set(CMAKE_OPTIMIZE_DEPENDENCIES true) + + # Setup CCache + find_program(CCACHE_EXECUTABLE ccache) + if(CCACHE_EXECUTABLE) + message(STATUS "CCache found at: " ${CCACHE_EXECUTABLE}) + set(ccacheEnv + CCACHE_SLOPPINESS=pch_defines,time_macros + ) + # NOTE: Ccache 4.2+ required for reliable CUDA support + foreach(lang IN ITEMS C CXX OBJC OBJCXX CUDA) + set(CMAKE_${lang}_COMPILER_LAUNCHER + ${CMAKE_COMMAND} -E env ${ccacheEnv} ${CCACHE_EXECUTABLE} + ) + endforeach() + endif() +endif() + # generate compile_commands.json set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -130,6 +169,31 @@ endif() ###### Configure features ###### +################################################################################ +# For GNN matrix multiplies +# TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file +################################################################################ +if(USE_MKL_BLAS) + find_package(MKL CONFIG REQUIRED PATH $ENV{MKL_ROOT}) + if (MKL_FOUND) + else() + message(WARNING "MKL not found") + endif() +endif() + +#SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +if(USE_OPENBLAS) + find_package(OpenBLAS) + message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") + if (OPENBLAS_FOUND) + include_directories(${OPENBLAS_INCLUDE_DIRS}) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +################################################################################ + if(GALOIS_ENABLE_VTUNE) set(VTune_ROOT /opt/intel/vtune_amplifier) find_package(VTune REQUIRED) @@ -137,6 +201,7 @@ if(GALOIS_ENABLE_VTUNE) add_definitions(-DGALOIS_ENABLE_VTUNE) endif() + if(GALOIS_ENABLE_PAPI) find_package(PAPI REQUIRED) include_directories(${PAPI_INCLUDE_DIRS}) @@ -191,6 +256,7 @@ endif() add_custom_target(lib) add_custom_target(apps) +add_subdirectory(external) # Core libraries (lib) add_subdirectory(libsupport) @@ -201,8 +267,19 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + if (GALOIS_ENABLE_WMD) + find_package(MPI REQUIRED) + add_subdirectory(libwmd) + endif() endif() + +# TODO(loc) prefix with GALOIS +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) +endif(USE_DEEPGALOIS) + string(COMPARE NOTEQUAL "${GALOIS_CUDA_CAPABILITY}" "" GALOIS_ENABLE_GPU) + if (GALOIS_ENABLE_GPU) enable_language(CUDA) foreach(GENCODE ${GALOIS_CUDA_CAPABILITY}) @@ -210,8 +287,42 @@ if (GALOIS_ENABLE_GPU) add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") endforeach() + # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included) + 
add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) + add_subdirectory(libgpu) + + if (USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_HOST_COMPILER g++) + + string(REPLACE "." "" GENCODES ${GALOIS_CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + + # MGPU v1.1 + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + cuda_include_directories("${MGPU_ROOT}/src") + + # CUB v1.6.4 + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + cuda_include_directories("${CUB_ROOT}") + + #find_package(OpenCL REQUIRED) + endif() endif() + +if (GALOIS_ENABLE_DIST AND USE_MKL_BLAS) + # here because I need the GPU declarations above + add_subdirectory(libgnn) +endif() + add_subdirectory(libpangolin) # Applications (apps) @@ -274,3 +385,11 @@ set(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH}) include(CPack) + +if(STACK_CAPTURE) + message("Writing CMAKE_CXX_FLAGS") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTACK_TRACE -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}" CACHE STRING "CMAKE Flags" FORCE) +endif(STACK_CAPTURE) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..007227dc70 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing + +Contributors must run quality checks on code. In place of CI we +recommend using `pre-commit` (described below) instead of running +tools like `clang-format` manually. + +Code should be clear and documented where needed. + +## Setup + +Users can run `make docker-image` to set up all dependencies needed for +`pando-galois`. After creating the image it can be run via `make docker`. +First-time CMake users can then run `make run-cmake`. + +## Tools + +### [asdf](https://asdf-vm.com) + +Provides a declarative set of tools pinned to +specific versions for environmental consistency. + +These tools are defined in `.tool-versions`. +Run `make dependencies` to initialize a new environment. + +### [pre-commit](https://pre-commit.com) + +A left-shifting tool that consistently runs a set of checks on the code repo. +Our checks enforce syntax validations and formatting. +We encourage contributors to use pre-commit hooks.
+ +```shell +# install all pre-commit hooks +make hooks + +# run pre-commit on repo once +make pre-commit +``` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..cceb15b94a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,88 @@ +ARG BUILD_IMAGE=ubuntu:22.04 +FROM --platform=linux/amd64 ${BUILD_IMAGE} AS dev + +WORKDIR /tmp + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && \ + apt install -y \ + cmake \ + gcc \ + g++ \ + ccache \ + build-essential \ + make \ + libboost-all-dev \ + libfmt-dev \ + libzstd-dev \ + lsb-release \ + wget \ + software-properties-common \ + gnupg \ + gdb \ + vim \ + git \ + python3 \ + python3-pip \ + unzip \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# setup intel repo for intel-basekit +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \ + gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \ + tee /etc/apt/sources.list.d/oneAPI.list +RUN apt update && \ + apt install -y \ + intel-basekit \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" + +ENV NINJA_BUILD_VERSION=1.11.1 +RUN wget https://github.com/ninja-build/ninja/releases/download/v${NINJA_BUILD_VERSION}/ninja-linux.zip -P /tmp && \ + unzip /tmp/ninja-linux.zip -d /usr/bin && \ + rm /tmp/ninja-linux.zip + +ARG IS_CI=true + +RUN if [ "${IS_CI}" != "true" ] ; then \ + apt update -y \ + && apt install -y \ + vim \ + gdb \ + universal-ctags \ + powerline \ + zsh \ + valgrind \ + sudo \ + doxygen \ + texlive-latex-extra \ + texlive-font-utils \ + && apt clean; fi + +ARG SRC_DIR=/pando-galois +ARG BUILD_DIR=/pando-galois/dockerbuild +ARG UNAME +ARG UID +ARG GID + +RUN if [ "${UNAME}" != "root" ] ; then groupadd -g ${GID} ${UNAME} \ + && useradd -ms /bin/bash -u "${UID}" -g "${GID}" ${UNAME} ; fi + +RUN mkdir -p /home/${UNAME} \ + && chown ${UNAME}:${UNAME} /home/${UNAME} + +USER ${UNAME} +WORKDIR /home/${UNAME} +ENV BUILD_DIR=${BUILD_DIR} + +RUN pip3 install compdb pre-commit cpplint "clang-format>=14.0.0,<17.0.0" + +RUN echo "PATH=/home/${UNAME}/.local/bin/:\$PATH" >> /home/${UNAME}/.zshenv + +RUN echo "export SRC_DIR=${SRC_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export BUILD_DIR=${BUILD_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "source /opt/intel/oneapi/setvars.sh > /dev/null" >> /home/${UNAME}/.bashrc + +WORKDIR ${SRC_DIR} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..1a5e58b116 --- /dev/null +++ b/Makefile @@ -0,0 +1,131 @@ +SHELL := /bin/bash + +UNAME ?= $(shell whoami) +UID ?= $(shell id -u) +GID ?= $(shell id -g) + +BASE_IMAGE_NAME ?= pando-galois +IMAGE_NAME ?= ${UNAME}-${BASE_IMAGE_NAME} +SRC_DIR ?= $(shell pwd) +VERSION ?= $(shell git log --pretty="%h" -1 Dockerfile) + +CONTAINER_SRC_DIR ?= /pando-galois +CONTAINER_BUILD_DIR ?= /pando-galois/build +CONTAINER_WORKDIR ?= ${CONTAINER_SRC_DIR} +CONTAINER_CONTEXT ?= default +CONTAINER_OPTS ?= +CONTAINER_CPUSET ?= +CONTAINER_CMD ?= bash -l +INTERACTIVE ?= i + +BUILD_TYPE ?= RelWithDebInfo + +# CMake variables +GALOIS_EXTRA_CMAKE_FLAGS ?= "" +GALOIS_EXTRA_CXX_FLAGS ?= "" + +# Developer variables that should be set as env vars in startup files like .profile +GALOIS_CONTAINER_MOUNTS ?= +GALOIS_CONTAINER_ENV ?= +GALOIS_CONTAINER_FLAGS ?= +GALOIS_BUILD_TOOL ?= 'Unix Makefiles' +GALOIS_CCACHE_DIR ?= ${SRC_DIR}/.ccache + +dependencies: dependencies-asdf + 
+dependencies-asdf: + @echo "Updating asdf plugins..." + @asdf plugin update --all >/dev/null 2>&1 || true + @echo "Adding new asdf plugins..." + @cut -d" " -f1 ./.tool-versions | xargs -I % asdf plugin-add % >/dev/null 2>&1 || true + @echo "Installing asdf tools..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf install {}' + @echo "Updating local environment to use proper tool versions..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf local {}' + @asdf reshim + @echo "Done!" + +hooks: + @pre-commit install --hook-type pre-commit + @pre-commit install-hooks + +pre-commit: + @pre-commit run -a + +ci-image: + @${MAKE} docker-image-dependencies + @docker image inspect galois:${VERSION} >/dev/null 2>&1 || \ + docker --context ${CONTAINER_CONTEXT} build \ + --build-arg SRC_DIR=${CONTAINER_SRC_DIR} \ + --build-arg BUILD_DIR=${CONTAINER_BUILD_DIR} \ + --build-arg UNAME=runner \ + --build-arg UID=1078 \ + --build-arg GID=504 \ + -t galois:${VERSION} \ + --file Dockerfile \ + --target dev . + +docker-image: + @${MAKE} docker-image-dependencies + @docker image inspect ${IMAGE_NAME}:${VERSION} >/dev/null 2>&1 || \ + docker --context ${CONTAINER_CONTEXT} build \ + --build-arg SRC_DIR=${CONTAINER_SRC_DIR} \ + --build-arg BUILD_DIR=${CONTAINER_BUILD_DIR} \ + --build-arg UNAME=${UNAME} \ + --build-arg IS_CI=false \ + --build-arg UID=${UID} \ + --build-arg GID=${GID} \ + -t ${IMAGE_NAME}:${VERSION} \ + --file Dockerfile \ + --target dev . + +docker-image-dependencies: + @mkdir -p build + @mkdir -p data + @mkdir -p .ccache + +.PHONY: docker +docker: + @docker --context ${CONTAINER_CONTEXT} run --rm \ + -v ${SRC_DIR}/:${CONTAINER_SRC_DIR} \ + -v ${GALOIS_CCACHE_DIR}/:/home/${UNAME}/.ccache \ + ${GALOIS_CONTAINER_MOUNTS} \ + ${GALOIS_CONTAINER_ENV} \ + ${GALOIS_CONTAINER_FLAGS} \ + ${CONTAINER_CPUSET} \ + --privileged \ + --workdir=${CONTAINER_WORKDIR} \ + ${CONTAINER_OPTS} \ + -${INTERACTIVE}t \ + ${IMAGE_NAME}:${VERSION} \ + ${CONTAINER_CMD} + +run-cmake: + @cmake \ + -S ${SRC_DIR} \ + -B ${BUILD_DIR} \ + -G ${GALOIS_BUILD_TOOL} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DUSE_MKL_BLAS=ON \ + -DGALOIS_ENABLE_DIST=ON \ + ${GALOIS_EXTRA_CMAKE_FLAGS} + +setup: run-cmake + +setup-ci: run-cmake + +run-tests: + @ctest --test-dir build -R wmd --verbose + @ctest --test-dir build -R large-vec --verbose + @ctest --test-dir build -R compile-lscsr --verbose + @ctest --test-dir build -R prefixsum --verbose + @ctest --test-dir build -R wfl --verbose + +# this command is slow since hooks are not stored in the container image +# this is mostly for CI use +docker-pre-commit: + @docker --context ${CONTAINER_CONTEXT} run --rm \ + -v ${SRC_DIR}/:${CONTAINER_SRC_DIR} --privileged \ + --workdir=${CONTAINER_WORKDIR} -t \ + ${IMAGE_NAME}:${VERSION} bash -lc "git config --global --add safe.directory /pando-galois && make hooks && make pre-commit" diff --git a/README.md b/README.md index 3375e800ee..ffda74f765 100644 --- a/README.md +++ b/README.md @@ -11,17 +11,17 @@ an implicitly parallel programming model, where the programmer replaces serial l constructs (e.g. for and while) and serial data structures in their algorithms with parallel loop constructs and concurrent data structures provided by Galois to express their algorithms. Galois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as -threads, locks, barriers, condition variables, etc. +threads, locks, barriers, condition variables, etc. 
Highlights include: - Parallel *for_each* loop that handles dependencies between iterations, as well as dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent scalability on multi-socket systems - A concurrent graph library designed for graph analytics algorithms as well as - other domains such as irregular meshes. -- Scalable concurrent containers such as bag, vector, list, etc. + other domains such as irregular meshes. +- Scalable concurrent containers such as bag, vector, list, etc. -Galois is released under the BSD-3-Clause license. +Galois is released under the BSD-3-Clause license. Building Galois @@ -45,7 +45,7 @@ Dependencies Galois builds, runs, and has been tested on GNU/Linux. Even though Galois may build on systems similar to Linux, we have not tested correctness or performance, so please -beware. +beware. At the minimum, Galois depends on the following software: @@ -55,7 +55,7 @@ At the minimum, Galois depends on the following software: - libllvm (>= 7.0 with RTTI support) - libfmt (>= 4.0) -Here are the dependencies for the optional features: +Here are the dependencies for the optional features: - Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages: @@ -70,13 +70,14 @@ Here are the dependencies for the optional features: - libnuma support. Performance may degrade without it. Please install - libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. -- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files + libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. +- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files - PAPI (>= 5.2.0.0 ) for profiling sections of code - Vtune (>= 2017 ) for profiling sections of code - MPICH2 (>= 3.2) if you are interested in building and running distributed system applications in Galois -- CUDA (>= 8.0) if you want to build GPU or distributed heterogeneous applications +- CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications. + Note that versions >= 11.0 use an incompatible CUB module and will fail to execute. - Eigen (3.3.1 works for us) for some matrix-completion app variants @@ -148,6 +149,12 @@ ctest in the build directory. +Capturing Stack Information +--------------------------- +Currently, adding `-DSTACK_CAPTURE=ON` to your `cmake` line enables stack capturing. +Please view `libgalois/include/galois/runtime/StackTracer.h` for documentation on the functions for printing and resetting. +Do not otherwise attempt to modify the capture process. + Running Galois Applications =========================== @@ -156,9 +163,9 @@ Graph Format ------------ Many Galois/Lonestar applications work with graphs. We store graphs in a binary format -called *galois graph file* +called *galois graph file* (`.gr` file extension). Other formats such as edge-list or Matrix-Market can be -converted to `.gr` format with `graph-convert` tool provided in galois. +converted to `.gr` format with `graph-convert` tool provided in galois.
You can build graph-convert as follows: ```Shell cd $BUILD_DIR make graph-convert ``` Other applications, such as Delaunay Mesh Refinement may read special file formats -or some may even generate random inputs on the fly. +or some may even generate random inputs on the fly. Running ------- All Lonestar applications take a `-t` command-line option to specify the number of threads to use. All applications run a basic sanity check (often insufficient for -correctness) on the program output, which can be turned off with the `-noverify` option. You -can specify `-help` command-line option to print all available options. +correctness) on the program output, which can be turned off with the `-noverify` option. You +can specify `-help` command-line option to print all available options. Upon successful completion, each application will produce some stats regarding running time of various sections, parallel loop iterations and memory usage, etc. These stats are in CSV format and can be redirected to a file using `-statFile` option. -Please refer to the manual for details on stats. +Please refer to the manual for details on stats. Running LonestarGPU applications -------------------------- @@ -199,7 +206,7 @@ Documentation ============= Galois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's -manual and API documentation for the Galois library. +manual and API documentation for the Galois library. Users can build doxygen documentation in the build directory using: @@ -215,12 +222,12 @@ See online documentation at: Source-Tree Organization ======================== -- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. +- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. - `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois - `libdist` contains the source code for the distributed-memory and heterogeneous Galois library - `lonestardist` contains the source code for the distributed-memory and heterogeneous benchmark applications. Please refer to `lonestardist/README.md` for instructions on - building and running these apps. + building and running these apps. - `tools` contains various helper programs such as graph-converter to convert between graph file formats and graph-stats to print graph properties diff --git a/README_SHAD.md b/README_SHAD.md new file mode 100644 index 0000000000..4253bb0e55 --- /dev/null +++ b/README_SHAD.md @@ -0,0 +1,57 @@ +README related to SHAD input graph ingestion +(Including some notes for other workflows) +This README is for our internal purposes. +It will be refined with more concrete information later. + +1. CMakeList paths: +The current CMake in Galois uses hard-coded paths for CUDA_HOME, +OPENBLAS_ROOT, INTEL_COMPILER_LIBRARIES, and MKL_LIBRARIES. +Please set those variables based on your environment. + + +2. Assumptions regarding SHAD WMD graph formats: +We assume that in SHAD WMD graph formats, each node and edge has a single type, +and those types are ALWAYS uint64_t. +The current Galois does not support node/edge properties (possibly, +programmers can implement a struct containing multiple +fields, but that is not like getData(n), etc.) +and so, we store those SHAD types in node and edge data. +If you need types other than uint64_t, you should add new execution paths for +them. + + +3.
Limitations of the current SHAD graph ingestion module: +In the original CuSP, each host reads parts of the .gr graph file and constructs +its in-memory format. In this case, each host does not need to load the full graph +in its memory space. This is possible since the .gr file is CSR and each component +such as outgoing edge indices, outgoing edge destinations, and outgoing edge +data is stored consecutively. + +However, in the SHAD graph format, not all components are stored consecutively; +they are unsorted. For example, edges and nodes can be stored in an interleaved +manner. Therefore, it is not possible to read partial graphs by using +the original method. + +The current SHAD graph ingestion does not aim to be a scalable method; +its goal is simply to make SHAD graphs work in Galois so that workflows can proceed, +so each host reads the FULL graph into memory. This should NOT be the final +artifact, since our long-term target graphs will exceed a single machine's memory. +But for the immediate goal and the target data sets, I assume that it is fine +for now. + +The UT team is currently working on new graph formats for dynamic graphs, and +scalable SHAD graph ingestion across hosts. + +4. TODO: +CuSP marks training/test/validation nodes while it is partitioning a graph. +This is not yet implemented for SHAD graphs. +It will be added in a GNN/feature construction branch. + +5. Requirements: +Galois-GNN requires the additional packages listed below on top of the requirements of Galois. +Older or newer versions may work; the versions listed below are the ones that have been used (by hochan): +1) Intel MKL: 2023.1.0 +2) Intel Compiler (including runtime libraries): 2023.0.0 +3) Intel Onedpl-devel library: 2023.1.0 +4) Intel OpenMP: 2023.0.0 + diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..d87020f770 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,24 @@ +# Find MKL libraries +# Once done this will define +# MKL_FOUND - System has MKL +# MKL_INCLUDE_DIRS - The MKL include directories +# MKL_LIBRARIES - The libraries needed to use MKL + +set(MKL_LIBRARIES) # Include-only library + +if(MKL_INCLUDE_DIRS) + set(MKL_FIND_QUIETLY TRUE) +endif() + +find_path(MKL_INCLUDE_DIRS mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include) +message(STATUS "MKL_INCLUDE_DIRS: ${MKL_INCLUDE_DIRS}") +find_library(MKL_LIBRARY NAMES mkl_rt PATHS ${MKL_ROOT} PATH_SUFFIXES lib/intel64) +message(STATUS "MKL_LIBRARY: ${MKL_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIRS) +if(MKL_FOUND) + set(MKL_FOUND on) +endif() + +mark_as_advanced(MKL_INCLUDE_DIRS) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake new file mode 100644 index 0000000000..3f595744d0 --- /dev/null +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -0,0 +1,24 @@ +# Find OpenBLAS libraries +# Once done this will define +# OpenBLAS_FOUND - System has OpenBLAS +# OpenBLAS_INCLUDE_DIRS - The OpenBLAS include directories +# OpenBLAS_LIBRARIES - The libraries needed to use OpenBLAS + +set(OPENBLAS_LIBRARIES) # Include-only library + +if(OPENBLAS_INCLUDE_DIRS) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif() + +find_path(OPENBLAS_INCLUDE_DIRS cblas.h PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES include/openblas) +message(STATUS "OPENBLAS_INCLUDE_DIRS: ${OPENBLAS_INCLUDE_DIRS}") +find_library(OPENBLAS_LIBRARY NAMES openblas PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES lib64) +message(STATUS "OPENBLAS_LIBRARY: ${OPENBLAS_LIBRARY}")
+include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_LIBRARY OPENBLAS_INCLUDE_DIRS) +if(OPENBLAS_FOUND) + set(OPENBLAS_FOUND on) +endif() + +mark_as_advanced(OPENBLAS_INCLUDE_DIRS) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt new file mode 100644 index 0000000000..310000adc8 --- /dev/null +++ b/external/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(parallel-hashmap) \ No newline at end of file diff --git a/external/cub b/external/cub deleted file mode 160000 index c3cceac115..0000000000 --- a/external/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/external/moderngpu b/external/moderngpu deleted file mode 160000 index 2b3985541c..0000000000 --- a/external/moderngpu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2b3985541c8e88a133769598c406c33ddde9d0a5 diff --git a/external/parallel-hashmap b/external/parallel-hashmap new file mode 160000 index 0000000000..67c24619e4 --- /dev/null +++ b/external/parallel-hashmap @@ -0,0 +1 @@ +Subproject commit 67c24619e4f5ab2097b74cc397732c17a25d6944 diff --git a/external/pcg-cpp b/external/pcg-cpp new file mode 160000 index 0000000000..428802d1a5 --- /dev/null +++ b/external/pcg-cpp @@ -0,0 +1 @@ +Subproject commit 428802d1a5634f96bcd0705fab379ff0113bcf13 diff --git a/inputs/wmd/data.00001.csv b/inputs/wmd/data.00001.csv new file mode 100644 index 0000000000..0f18f74182 --- /dev/null +++ b/inputs/wmd/data.00001.csv @@ -0,0 +1,744 @@ +#delimieter: , +#columns:type,person1,person2,forum,forum_event,publication,topic,date,lat,lon +#types:STRING,UINT,UINT,UINT,UINT,UINT,UINT,USDATE,DOUBLE,DOUBLE +Publication,,,,,102583151124020340,,4/1/2013,, +Publication,,,,,1004346153600881042,,12/2/2014,, +Publication,,,,,1433303251800176474,,1/1/2014,, +Publication,,,,,963345652072941810,,3/1/2017,, +ForumEvent,,,1372844135435303981,1651365355351122204,,,1/7/2019,, +ForumEvent,,,1372844135435303981,1060309546214304182,,,1/3/2018,, +ForumEvent,,,1372844135435303981,932362105613871012,,,1/8/2018,, +ForumEvent,,,1372844135435303981,618434247743641149,,,1/8/2018,, +ForumEvent,,,1372844135435303981,1209342585680609487,,,1/10/2018,, +ForumEvent,,,1615340315424362057,1245126351375505703,,,2/13/2018,, +ForumEvent,,,1372844135435303981,581543512052485139,,,2/5/2018,, +ForumEvent,,,1314315120197156050,833681012494554358,,,3/23/2018,, +ForumEvent,,,1615340315424362057,1220295546212024391,,,3/26/2018,, +ForumEvent,,,1372844135435303981,1424263331858043042,,,4/5/2018,, +ForumEvent,,,1615340315424362057,1290121451283392110,,,4/12/2018,, +ForumEvent,,,1427292001647224242,240337224527030225,,,4/24/2018,, +ForumEvent,,,1615340315424362057,440265285168056234,,,5/17/2018,, +ForumEvent,,,1615340315424362057,817526874194673140,,,5/31/2018,, +ForumEvent,,,1314315120197156050,846536331643665114,,,6/12/2018,, +ForumEvent,,,1202482536733844323,1114502034902546550,,,6/14/2018,, +ForumEvent,,,1372844135435303981,1441762191425652442,,,7/8/2018,, +ForumEvent,,,1615340315424362057,128423416112315798,,,7/20/2018,, +ForumEvent,,,1615340315424362057,701755398615636460,,,8/1/2018,, +ForumEvent,,,1314315120197156050,393285992310638641,,,8/12/2018,, +ForumEvent,,,1615340315424362057,420762134340393550,,,9/9/2018,, +ForumEvent,,,1372844135435303981,737353170652104031,,,9/14/2018,, +ForumEvent,,,1615340315424362057,116892402526543412,,,10/13/2018,, +ForumEvent,,,1372844135435303981,1028329324575034354,,,10/20/2018,, +ForumEvent,,,1202482536733844323,1513662032452523252,,,10/30/2018,, 
+ForumEvent,,,1314315120197156050,803952155714850701,,,11/14/2018,, +ForumEvent,,,1372844135435303981,186108460103013588,,,11/12/2018,, +ForumEvent,,,1615340315424362057,1184855350262395542,,,12/1/2018,, +ForumEvent,,,1372844135435303981,1302313601603127196,,,12/16/2018,, +ForumEvent,,,1615340315424362057,78678286442461987,,,1/11/2019,, +ForumEvent,,,15133734353741126,1285128710332882742,,,1/10/2019,, +ForumEvent,,,1615340315424362057,447169043921403064,,,2/2/2019,, +ForumEvent,,,1372844135435303981,91431002216341149,,,2/13/2019,, +ForumEvent,,,1202482536733844323,1296829658689065159,,,2/13/2019,, +ForumEvent,,,1615340315424362057,877764733212222524,,,3/28/2019,, +ForumEvent,,,1314315120197156050,1614534111336540475,,,3/3/2019,, +ForumEvent,,,1615340315424362057,209800678458482108,,,4/14/2019,, +ForumEvent,,,15133734353741126,1532662490035322233,,,4/1/2019,, +ForumEvent,,,1314315120197156050,321724159614056152,,,5/29/2019,, +ForumEvent,,,1372844135435303981,1512214307542520410,,,5/17/2019,, +ForumEvent,,,1615340315424362057,740410432146852843,,,6/5/2019,, +ForumEvent,,,1372844135435303981,82629615412640377,,,6/24/2019,, +ForumEvent,,,1427292001647224242,936722743217343702,,,6/30/2019,, +ForumEvent,,,1372844135435303981,747423119260925972,,,7/11/2019,, +ForumEvent,,,451888058015735870,541215404780905313,,,7/3/2019,, +ForumEvent,,,1615340315424362057,1424660009578332566,,,8/25/2019,, +ForumEvent,,,1314315120197156050,1282227710122181132,,,8/5/2019,, +ForumEvent,,,1314315120197156050,854149383334143372,,,9/19/2019,, +ForumEvent,,,1615340315424362057,202421472143651025,,,9/21/2019,, +ForumEvent,,,353365307219544531,956704137555154092,,,10/17/2019,, +ForumEvent,,,,1142353335442842612,,,10/2/2019,, +ForumEvent,,,,1417645062678302203,,,10/27/2019,, +ForumEvent,,,,691612430615344311,,,11/18/2019,, +ForumEvent,,,,499518911125406276,,,11/7/2019,, +ForumEvent,,,,802203574353867462,,,12/26/2019,, +ForumEvent,,,,1154045191214226005,,,12/19/2019,, +Forum,,,227560344059645632,,,,,, +Forum,,,642724485236726353,,,,,, +Forum,,,1583773067440233990,,,,,, +Forum,,,353365307219544531,,,,,, +Forum,,,1372844135435303981,,,,,, +Forum,,,817570614729612563,,,,,, +Forum,,,1222966301068614432,,,,,, +Forum,,,254347350613723281,,,,,, +Forum,,,230406515001545612,,,,,, +Forum,,,1561731546512891652,,,,,, +Forum,,,188043543797416114,,,,,, +Forum,,,1083041743586306041,,,,,, +Forum,,,132472381132383125,,,,,, +Forum,,,20118285562646166,,,,,, +Forum,,,555784630220125214,,,,,, +Forum,,,1015255971523263924,,,,,, +Forum,,,1342495276080758813,,,,,, +Forum,,,851350143155248158,,,,,, +Forum,,,1427292001647224242,,,,,, +Forum,,,722051276937327353,,,,,, +Forum,,,1107212912316309796,,,,,, +Forum,,,504490409499070811,,,,,, +Forum,,,15133734353741126,,,,,, +Forum,,,869745302967338810,,,,,, +Forum,,,324124332757504717,,,,,, +Forum,,,852491638004013222,,,,,, +Forum,,,1040437236245414809,,,,,, +Forum,,,442231451428861295,,,,,, +Forum,,,101022092642335391,,,,,, +Forum,,,1037815940207624157,,,,,, +Forum,,,1331941318481662527,,,,,, +Forum,,,1615340315424362057,,,,,, +Forum,,,1425519641234605945,,,,,, +Forum,,,705065952261175094,,,,,, +Forum,,,1314315120197156050,,,,,, +Forum,,,214214821270800149,,,,,, +Forum,,,1361197157264541395,,,,,, +Forum,,,1033538541314217453,,,,,, +Forum,,,565733832133342431,,,,,, +Forum,,,451888058015735870,,,,,, +Forum,,,155345234637251110,,,,,, +Forum,,,1371100161965701220,,,,,, +Forum,,,1307221369082243900,,,,,, +Forum,,,406508153569651122,,,,,, +Forum,,,1202482536733844323,,,,,, +Forum,,,912373284682369433,,,,,, 
+Person,477384404927196020,,,,,,,, +Person,182010581109145287,,,,,,,, +Topic,,,,,,271997,,, +Topic,,,,,,127197,,, +Person,284405379592161575,,,,,,,, +Topic,,,,,,11650,,, +Topic,,,,,,185785,,, +Topic,,,,,,1907525,,, +Topic,,,,,,1333024,,, +Topic,,,,,,2329,,, +Topic,,,,,,571,,, +Topic,,,,,,56683126,,, +Topic,,,,,,146,,, +Topic,,,,,,487,,, +Topic,,,,,,193294,,, +Topic,,,,,,177,,, +Topic,,,,,,81944,,, +Topic,,,,,,998,,, +Topic,,,,,,55424107,,, +Topic,,,,,,41323,,, +Topic,,,,,,38695,,, +Topic,,,,,,379860,,, +Topic,,,,,,1149078,,, +Topic,,,,,,172809,,, +Topic,,,,,,1642639,,, +Topic,,,,,,903552,,, +Topic,,,,,,204,,, +Topic,,,,,,7817,,, +Topic,,,,,,201816,,, +Topic,,,,,,785,,49.19,-2.11 +Topic,,,,,,127,,, +Topic,,,,,,206021,,, +Topic,,,,,,181508,,, +Topic,,,,,,735,,, +Topic,,,,,,304878,,, +Topic,,,,,,7590,,, +Topic,,,,,,8074,,, +Topic,,,,,,24862,,, +Topic,,,,,,35127,,, +Topic,,,,,,60,,40.67,-73.94 +Topic,,,,,,443533,,, +Person,1160244137181801222,,,,,,,, +Topic,,,,,,192242,,, +Topic,,,,,,11707,,, +Topic,,,,,,73843,,, +Topic,,,,,,505619,,, +Topic,,,,,,158668,,, +Topic,,,,,,889,,34.0,66.0 +Person,895197896920634500,,,,,,,, +Topic,,,,,,18426,,40.84676,-73.873207 +Topic,,,,,,787185,,, +Topic,,,,,,467,,, +Person,1419850416906085161,,,,,,,, +Topic,,,,,,2869238,,, +Topic,,,,,,5,,, +Topic,,,,,,334600,,, +Topic,,,,,,191290,,, +Topic,,,,,,122113,,, +Topic,,,,,,179057,,, +Topic,,,,,,11635,,, +Topic,,,,,,329717,,, +Person,33927662206515912,,,,,,,, +Topic,,,,,,35140,,, +Topic,,,,,,485537,,, +Topic,,,,,,102014,,, +Topic,,,,,,40357,,, +Topic,,,,,,1337691,,, +Topic,,,,,,160409,,40.7825,-73.966111111111 +Topic,,,,,,69871376,,, +Topic,,,,,,177749,,, +Topic,,,,,,11348,,, +Topic,,,,,,182218,,, +Topic,,,,,,1229,,47.568611111111,40.852783333333 +Topic,,,,,,5322,,, +Person,1035098046740791143,,,,,,,, +Topic,,,,,,792565,,48.10277778,20.78388889 +Topic,,,,,,37654,,, +Topic,,,,,,25395,,40.735277777778,-74.185 +Topic,,,,,,169313,,, +Topic,,,,,,728,,, +Topic,,,,,,699385,,, +Topic,,,,,,22983,,, +Person,971383124880710240,,,,,,,, +Person,1010629269012322480,,,,,,,, +Topic,,,,,,11299,,40.728333333333,-73.994166666667 +Topic,,,,,,83460,,, +Topic,,,,,,10289,,, +Topic,,,,,,11019,,, +Topic,,,,,,470118,,, +Person,1426050562563532645,,,,,,,, +Person,75415528634186650,,,,,,,, +Person,1001287904525368324,,,,,,,, +Person,242111862342742257,,,,,,,, +Topic,,,,,,11249,,40.747,-73.986 +Topic,,,,,,3933135,,, +Topic,,,,,,44311,,, +Person,1025135622623992536,,,,,,,, +Topic,,,,,,37497186,,, +Person,584485814982143221,,,,,,,, +Person,1508332501512270227,,,,,,,, +Topic,,,,,,328473,,40.712,-74.002 +Topic,,,,,,25347,,, +Topic,,,,,,175111,,, +Person,1312322776399358210,,,,,,,, +Topic,,,,,,16003594,,, +Topic,,,,,,48789658,,, +Topic,,,,,,8148,,, +Topic,,,,,,9420,,, +Topic,,,,,,771572,,40.699945,-73.950148 +Topic,,,,,,5088838,,, +Person,1597454052092354280,,,,,,,, +Person,961135479935321085,,,,,,,, +Topic,,,,,,123705,,, +Topic,,,,,,598435,,, +Topic,,,,,,732934,,, +Person,317248309514344163,,,,,,,, +Person,1524681741257900519,,,,,,,, +Topic,,,,,,254860,,, +Topic,,,,,,335046,,, +Person,534449219561977424,,,,,,,, +Person,1035056342462002945,,,,,,,, +Person,1222330726897222256,,,,,,,, +Person,493345739124130581,,,,,,,, +Topic,,,,,,831691,,, +Topic,,,,,,28321638,,, +Person,682588400093615551,,,,,,,, +Person,920136262355651383,,,,,,,, +Person,351354309273100074,,,,,,,, +Person,495352903902152146,,,,,,,, +Person,211778681592778731,,,,,,,, +Topic,,,,,,1189753,,, +Person,396953035572582107,,,,,,,, +Topic,,,,,,828749,,, +Topic,,,,,,904756,,, 
+Topic,,,,,,7392008,,, +Topic,,,,,,2566598,,, +Person,363047312690634767,,,,,,,, +Topic,,,,,,618102,,, +Person,205415260510814362,,,,,,,, +Person,1251650482793161774,,,,,,,, +Topic,,,,,,620463,,, +Person,1463522545161373807,,,,,,,, +Person,1150357430325141247,,,,,,,, +Person,674253449444876344,,,,,,,, +Person,1073324208204442390,,,,,,,, +Topic,,,,,,1049632,,40.665352,-73.969264 +Topic,,,,,,29171,,, +Person,1637740339335566412,,,,,,,, +Person,524508243055647325,,,,,,,, +Person,320151361710953715,,,,,,,, +Person,128643504412157535,,,,,,,, +Topic,,,,,,361,,, +Person,1243472362254658420,,,,,,,, +Topic,,,,,,617927,,, +Person,1275555184736572954,,,,,,,, +Topic,,,,,,974850,,, +Person,934144115142885657,,,,,,,, +Topic,,,,,,217627,,, +Topic,,,,,,223155,,, +Person,1504217244688272832,,,,,,,, +Person,144548678565311334,,,,,,,, +Person,1400516284533535554,,,,,,,, +Topic,,,,,,14528,,, +Person,1508951542204233332,,,,,,,, +Person,611325512448133762,,,,,,,, +Person,635555368637193420,,,,,,,, +Person,134403203055015143,,,,,,,, +Topic,,,,,,202013,,, +Topic,,,,,,7602643,,, +Topic,,,,,,121765,,40.774444444444,-73.904166666667 +Person,765254641650259739,,,,,,,, +Person,975526659664533195,,,,,,,, +Person,273872236541568195,,,,,,,, +Topic,,,,,,557887,,, +Topic,,,,,,774228,,, +Topic,,,,,,46744,,, +Person,352033450190732475,,,,,,,, +Person,841466124620556016,,,,,,,, +Person,1517466541524095404,,,,,,,, +Topic,,,,,,114633,,, +Person,301710390995444087,,,,,,,, +Topic,,,,,,16868955,,, +Person,747231730275042400,,,,,,,, +Person,1164902255571715230,,,,,,,, +Person,291914370254601234,,,,,,,, +Topic,,,,,,247154,,, +Topic,,,,,,519,,, +Topic,,,,,,3303945,,, +Person,1022241560051472272,,,,,,,, +Person,566448585007839403,,,,,,,, +Topic,,,,,,11229,,, +Person,735713441679521195,,,,,,,, +Person,1128501731262832684,,,,,,,, +Person,446962590481145702,,,,,,,, +Person,1125113326787431160,,,,,,,, +Person,437201545096608055,,,,,,,, +Person,940377106445268064,,,,,,,, +Person,1647329525841402942,,,,,,,, +Topic,,,,,,3884230,,, +Topic,,,,,,131191,,, +Person,1376053313411407054,,,,,,,, +Person,1347432655942023365,,,,,,,, +Person,1472154222902711100,,,,,,,, +Person,529550602103217450,,,,,,,, +Topic,,,,,,43035,,, +Topic,,,,,,126095,,, +Topic,,,,,,49088,,40.8075,-73.961944444444 +Person,910075513854877065,,,,,,,, +Topic,,,,,,5018694,,40.859105555556,-74.198686111111 +Topic,,,,,,2446683,,43.1189,20.0797 +Topic,,,,,,2030894,,40.850852,-73.844949 +Topic,,,,,,29718382,,, +Topic,,,,,,130965,,40.860833333333,-73.884444444444 +Topic,,,,,,167172,,, +Topic,,,,,,2456507,,, +Person,842652402732741813,,,,,,,, +Topic,,,,,,670897,,33.421111111111,-111.93166666667 +Topic,,,,,,1436668,,, +Topic,,,,,,753651,,, +Topic,,,,,,7451247,,, +Topic,,,,,,2493,,, +Person,719533111062900642,,,,,,,, +Person,834321901190546647,,,,,,,, +Topic,,,,,,12796,,, +Person,937074421253040138,,,,,,,, +Topic,,,,,,18159587,,, +Person,101810442957214781,,,,,,,, +Topic,,,,,,131401,,, +Topic,,,,,,929920,,, +Topic,,,,,,466439,,, +Topic,,,,,,6498684,,, +Topic,,,,,,206361,,, +Topic,,,,,,41796,,, +Person,1152266442105786574,,,,,,,, +Person,95240187156237415,,,,,,,, +Topic,,,,,,7897553,,, +Topic,,,,,,206887,,, +Topic,,,,,,5405633,,, +Person,1031526243841315760,,,,,,,, +Topic,,,,,,11348540,,, +Topic,,,,,,4198163,,, +Topic,,,,,,16048728,,, +Topic,,,,,,189756,,, +Topic,,,,,,643638,,, +Topic,,,,,,783874,,, +Topic,,,,,,492346,,37.2708,-76.7069 +Topic,,,,,,29042975,,, +Topic,,,,,,12103677,,, +Topic,,,,,,2329157,,, +Person,1563598527979706128,,,,,,,, +Topic,,,,,,4442,,, 
+Person,264075025125849069,,,,,,,, +Person,369370063627142227,,,,,,,, +Person,1300183120520109060,,,,,,,, +Topic,,,,,,18031504,,, +Topic,,,,,,4229887,,, +Person,611117914195523184,,,,,,,, +Topic,,,,,,7543639,,, +Topic,,,,,,13977,,, +Topic,,,,,,18122778,,, +Person,166319955306346577,,,,,,,, +Topic,,,,,,588894,,, +Topic,,,,,,2454265,,, +Person,1547400408884914628,,,,,,,, +Person,373641740834326257,,,,,,,, +Topic,,,,,,5264957,,, +Topic,,,,,,968598,,, +Person,754480939973310112,,,,,,,, +Topic,,,,,,1808877,,, +Person,1443919105364146460,,,,,,,, +Person,735243266472522113,,,,,,,, +Person,1321304826561136177,,,,,,,, +Person,1560601202484151215,,,,,,,, +Person,1403521534163206962,,,,,,,, +Person,231472126788137195,,,,,,,, +Person,208411288512434105,,,,,,,, +Topic,,,,,,7252790,,, +Person,1211456636406749825,,,,,,,, +Person,1071303249530347453,,,,,,,, +Person,1069710216181783510,,,,,,,, +Person,1578613817419480731,,,,,,,, +Person,944546653739552042,,,,,,,, +Topic,,,,,,8856932,,, +Person,616673625330310949,,,,,,,, +Person,1302421465423646583,,,,,,,, +Person,720320812100121121,,,,,,,, +Person,653345304799504620,,,,,,,, +Person,346401281431409585,,,,,,,, +Person,1526112405471861415,,,,,,,, +Person,1501623481588541372,,,,,,,, +Person,312380611598980641,,,,,,,, +Person,1115244423173415593,,,,,,,, +Person,1555348115336584230,,,,,,,, +Person,12321118467056216,,,,,,,, +Person,1352636429150180228,,,,,,,, +Person,725324491051434870,,,,,,,, +Person,846764541256336994,,,,,,,, +Person,140443713446471314,,,,,,,, +Person,1135272113235621141,,,,,,,, +Person,775818654043059161,,,,,,,, +Person,529476525413023401,,,,,,,, +Person,1262668194076216011,,,,,,,, +Person,119474435514352445,,,,,,,, +Person,437573095319558705,,,,,,,, +Person,1035555223142154728,,,,,,,, +Person,556320934631523806,,,,,,,, +Person,1356253242219285320,,,,,,,, +Person,248654236829951090,,,,,,,, +Person,481153633235353485,,,,,,,, +Includes,,,1202482536733844323,1296829658689065159,,,,, +HasTopic,,,,956704137555154092,,335046,,, +HasTopic,,,,1028329324575034354,,1808877,,, +HasTopic,,,,,1004346153600881042,735,,, +Author,1560601202484151215,,,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,131401,,, +HasTopic,,,,1302313601603127196,,48789658,,, +HasTopic,,,,1114502034902546550,,40357,,, +Sale,1275555184736572954,1463522545161373807,,,,,8/16/2018,, +HasTopic,,,,78678286442461987,,28321638,,, +HasTopic,,,,854149383334143372,,903552,,, +HasTopic,,,,240337224527030225,,519,,, +HasTopic,,,,116892402526543412,,5264957,,, +HasTopic,,,,202421472143651025,,12103677,,, +HasTopic,,,,393285992310638641,,470118,,, +Author,910075513854877065,,,,102583151124020340,,,, +Sale,1426050562563532645,75415528634186650,,,,,7/29/2018,, +Author,1128501731262832684,,,1513662032452523252,,,,, +Sale,971383124880710240,1010629269012322480,,,,38695,7/28/2018,, +Author,477384404927196020,,,1651365355351122204,,,,, +Author,725324491051434870,,,202421472143651025,,,,, +HasTopic,,,,932362105613871012,,69871376,,, +HasTopic,,,,1290121451283392110,,169313,,, +Sale,396953035572582107,1400516284533535554,,,,41323,8/21/2018,, +HasTopic,,,,701755398615636460,,8148,,, +Sale,940377106445268064,1647329525841402942,,,,25347,10/15/2018,, +HasTopic,,,,1651365355351122204,,643638,,, +HasTopic,,,,1114502034902546550,,46744,,, +HasTopic,,,,936722743217343702,,123705,,, +HasTopic,,,,321724159614056152,,6498684,,, +Sale,1419850416906085161,1128501731262832684,,,,2869238,09/28/2018,, +HasTopic,,,1372844135435303981,,,60,,, +Author,1578613817419480731,,,321724159614056152,,,,, 
+HasTopic,,,,1512214307542520410,,8074,,, +HasTopic,,,,618434247743641149,,192242,,, +Includes,,,1202482536733844323,1114502034902546550,,,,, +Includes,,,1615340315424362057,877764733212222524,,,,, +HasTopic,,,,240337224527030225,,1229,,, +HasTopic,,,,1209342585680609487,,179057,,, +Author,834321901190546647,,,,1433303251800176474,,,, +HasTopic,,,,1209342585680609487,,175111,,, +Includes,,,1372844135435303981,581543512052485139,,,,, +Author,720320812100121121,,,420762134340393550,,,,, +Includes,,,1615340315424362057,1424660009578332566,,,,, +HasTopic,,,,932362105613871012,,771572,,, +Includes,,,353365307219544531,956704137555154092,,,,, +HasTopic,,,,956704137555154092,,929920,,, +HasTopic,,,,1441762191425652442,,177749,,, +Author,1563598527979706128,,,1220295546212024391,,,,, +Author,248654236829951090,,,1424660009578332566,,,,, +Author,529476525413023401,,,1302313601603127196,,,,, +HasTopic,,,,1209342585680609487,,771572,,, +HasTopic,,,,,1433303251800176474,83460,,, +Sale,1022241560051472272,1637740339335566412,,,,1642639,10/13/2018,, +Includes,,,1615340315424362057,128423416112315798,,,,, +Includes,,,1372844135435303981,932362105613871012,,,,, +Includes,,,1372844135435303981,747423119260925972,,,,, +HasTopic,,,,854149383334143372,,24862,,, +Sale,1160244137181801222,1035056342462002945,,,,181508,10/4/2018,, +HasTopic,,,,1114502034902546550,,328473,,, +Includes,,,1314315120197156050,854149383334143372,,,,, +Sale,1472154222902711100,1128501731262832684,,,,185785,09/28/2018,, +HasOrg,,,,,102583151124020340,49088,,, +HasTopic,,,,932362105613871012,,11299,,, +HasTopic,,,,932362105613871012,,18426,,, +HasTopic,,,,,1004346153600881042,7392008,,, +Author,1526112405471861415,,,846536331643665114,,,,, +HasTopic,,,,78678286442461987,,617927,,, +HasTopic,,,,321724159614056152,,4229887,,, +HasTopic,,,,82629615412640377,,5322,,, +Author,1128501731262832684,,,1114502034902546550,,,,, +Author,166319955306346577,,,209800678458482108,,,,, +Sale,735713441679521195,1128501731262832684,,,,11650,10/10/2018,, +Author,944546653739552042,,,803952155714850701,,,,, +HasTopic,,,,581543512052485139,,3933135,,, +HasTopic,,,,209800678458482108,,4198163,,, +HasTopic,,,,932362105613871012,,7590,,, +Includes,,,1372844135435303981,1060309546214304182,,,,, +HasTopic,,,,846536331643665114,,167172,,, +HasTopic,,,,740410432146852843,,11348540,,, +Author,937074421253040138,,,,1004346153600881042,,,, +HasTopic,,,,1282227710122181132,,2493,,, +HasTopic,,,,,1433303251800176474,43035,,, +Sale,495352903902152146,211778681592778731,,,,81944,8/6/2018,, +Includes,,,1372844135435303981,1512214307542520410,,,,, +HasTopic,,,353365307219544531,,,2329,,, +Includes,,,1615340315424362057,701755398615636460,,,,, +Includes,,,1372844135435303981,1209342585680609487,,,,, +Includes,,,1202482536733844323,1513662032452523252,,,,, +HasTopic,,,,701755398615636460,,968598,,, +HasTopic,,,,128423416112315798,,11019,,, +Author,1356253242219285320,,,854149383334143372,,,,, +Author,1031526243841315760,,,,1433303251800176474,,,, +Author,842652402732741813,,,,1433303251800176474,,,, +HasTopic,,,,440265285168056234,,18159587,,, +Author,1501623481588541372,,,833681012494554358,,,,, +HasTopic,,,,1028329324575034354,,11299,,, +Author,140443713446471314,,,1028329324575034354,,,,, +Author,264075025125849069,,,1245126351375505703,,,,, +Includes,,,1314315120197156050,1614534111336540475,,,,, +HasTopic,,,,,1433303251800176474,131191,,, +HasTopic,,,,420762134340393550,,1907525,,, +Sale,975526659664533195,524508243055647325,,,,,10/15/2018,, 
+Author,284405379592161575,,,,102583151124020340,,,, +HasTopic,,,,,963345652072941810,735,,, +HasTopic,,,,,1433303251800176474,3303945,,, +Author,1152266442105786574,,,,963345652072941810,,,, +HasTopic,,,,202421472143651025,,1189753,,, +HasTopic,,,,1028329324575034354,,11229,,, +Includes,,,1615340315424362057,1220295546212024391,,,,, +Includes,,,1314315120197156050,846536331643665114,,,,, +HasTopic,,,,1296829658689065159,,7451247,,, +Includes,,,1615340315424362057,78678286442461987,,,,, +HasTopic,,,,541215404780905313,,11635,,, +Author,481153633235353485,,,956704137555154092,,,,, +Author,611117914195523184,,,701755398615636460,,,,, +Author,95240187156237415,,,,963345652072941810,,,, +Includes,,,1372844135435303981,91431002216341149,,,,, +Author,846764541256336994,,,1290121451283392110,,,,, +HasTopic,,,,393285992310638641,,206887,,, +HasTopic,,,,1512214307542520410,,177749,,, +HasTopic,,,,1114502034902546550,,7252790,,, +HasTopic,,,1202482536733844323,,,60,,, +HasTopic,,,,1302313601603127196,,41796,,, +HasTopic,,,,1114502034902546550,,44311,,, +Author,1547400408884914628,,,393285992310638641,,,,, +Author,312380611598980641,,,78678286442461987,,,,, +Sale,273872236541568195,1251650482793161774,,,,172809,8/22/2018,, +HasTopic,,,,1513662032452523252,,728,,, +Includes,,,1615340315424362057,447169043921403064,,,,, +Includes,,,1372844135435303981,82629615412640377,,,,, +HasTopic,,,,1302313601603127196,,29042975,,, +Author,1321304826561136177,,,1532662490035322233,,,,, +Includes,,,1372844135435303981,1302313601603127196,,,,, +Sale,363047312690634767,242111862342742257,,,,,10/4/2018,, +Author,1071303249530347453,,,737353170652104031,,,,, +HasTopic,,,,1282227710122181132,,35140,,, +HasTopic,,,,91431002216341149,,46744,,, +Includes,,,1372844135435303981,1441762191425652442,,,,, +Sale,446962590481145702,534449219561977424,,,,,10/11/2018,, +Author,1035555223142154728,,,877764733212222524,,,,, +Author,1403521534163206962,,,932362105613871012,,,,, +HasTopic,,,,701755398615636460,,35127,,, +Includes,,,1372844135435303981,1028329324575034354,,,,, +HasTopic,,,,1245126351375505703,,254860,,, +HasTopic,,,,209800678458482108,,7897553,,, +Includes,,,1372844135435303981,186108460103013588,,,,, +HasOrg,,,,,102583151124020340,304878,,, +HasTopic,,,,,1433303251800176474,998,,, +Author,653345304799504620,,,581543512052485139,,,,, +Author,1302421465423646583,,,240337224527030225,,,,, +Author,1211456636406749825,,,618434247743641149,,,,, +HasTopic,,,,240337224527030225,,785,,, +HasTopic,,,,,1004346153600881042,83460,,, +Includes,,,1615340315424362057,1245126351375505703,,,,, +HasTopic,,,,747423119260925972,,16868955,,, +HasTopic,,,,91431002216341149,,9420,,, +Includes,,,451888058015735870,541215404780905313,,,,, +HasTopic,,,,1209342585680609487,,492346,,, +HasTopic,,,,1290121451283392110,,114633,,, +HasTopic,,,,1441762191425652442,,11249,,, +HasTopic,,,,1532662490035322233,,753651,,, +HasTopic,,,,1532662490035322233,,11707,,, +HasTopic,,,,1245126351375505703,,904756,,, +HasTopic,,,15133734353741126,,,189756,,, +Sale,1222330726897222256,493345739124130581,,,,177,8/4/2018,, +Includes,,,1372844135435303981,1424263331858043042,,,,, +Includes,,,1615340315424362057,1290121451283392110,,,,, +Sale,477384404927196020,182010581109145287,,,,271997,2/17/2019,, +HasTopic,,,,737353170652104031,,16003594,,, +HasTopic,,,,747423119260925972,,2329157,,, +Sale,566448585007839403,765254641650259739,,,,,10/9/2018,, +HasOrg,,,,,963345652072941810,130965,,, +Author,616673625330310949,,,936722743217343702,,,,, 
+HasTopic,,,,1513662032452523252,,44311,,, +Author,119474435514352445,,,1282227710122181132,,,,, +HasTopic,,,,82629615412640377,,22983,,, +HasTopic,,,,833681012494554358,,201816,,, +HasTopic,,,,,1433303251800176474,14528,,, +Sale,33927662206515912,934144115142885657,,,,,10/10/2018,, +Includes,,,1372844135435303981,618434247743641149,,,,, +Includes,,,1615340315424362057,740410432146852843,,,,, +Author,1300183120520109060,,,128423416112315798,,,,, +Author,1069710216181783510,,,1296829658689065159,,,,, +Sale,1504217244688272832,144548678565311334,,,,56683126,8/13/2018,, +Author,1115244423173415593,,,1614534111336540475,,,,, +Sale,747231730275042400,584485814982143221,,,,127,10/7/2018,, +HasTopic,,,,1285128710332882742,,37654,,, +HasTopic,,,1615340315424362057,,,12796,,, +Author,1555348115336584230,,,817526874194673140,,,,, +HasTopic,,,,,102583151124020340,43035,,, +HasTopic,,,,737353170652104031,,206361,,, +Includes,,,1314315120197156050,833681012494554358,,,,, +Sale,128643504412157535,320151361710953715,,,,443533,8/24/2018,, +HasTopic,,,,1220295546212024391,,588894,,, +HasTopic,,,,202421472143651025,,7602643,,, +Sale,1150357430325141247,674253449444876344,,,,,10/7/2018,, +Author,735243266472522113,,,1209342585680609487,,,,, +Includes,,,1372844135435303981,1651365355351122204,,,,, +HasTopic,,,,1424660009578332566,,618102,,, +HasTopic,,,,,102583151124020340,126095,,, +HasTopic,,,,,1004346153600881042,2446683,,, +HasTopic,,,,,1433303251800176474,598435,,, +Author,735713441679521195,,,,1433303251800176474,,,, +Includes,,,1314315120197156050,321724159614056152,,,,, +HasTopic,,,,1114502034902546550,,8856932,,, +HasTopic,,,,,1433303251800176474,5405633,,, +HasTopic,,,,1285128710332882742,,11299,,, +HasTopic,,,,420762134340393550,,12796,,, +HasTopic,,,,1184855350262395542,,329717,,, +HasTopic,,,,116892402526543412,,2456507,,, +Includes,,,1615340315424362057,817526874194673140,,,,, +Sale,841466124620556016,1517466541524095404,,,,,8/27/2018,, +HasTopic,,,,1209342585680609487,,127197,,, +HasTopic,,,,420762134340393550,,16048728,,, +HasTopic,,,,209800678458482108,,774228,,, +Author,231472126788137195,,,82629615412640377,,,,, +HasTopic,,,,,1433303251800176474,7817,,, +HasTopic,,,,932362105613871012,,1049632,,, +Sale,1164902255571715230,291914370254601234,,,,571,10/8/2018,, +HasTopic,,,,,1433303251800176474,467,,, +HasTopic,,,,1220295546212024391,,37497186,,, +HasTopic,,,,817526874194673140,,732934,,, +Includes,,,1615340315424362057,202421472143651025,,,,, +HasTopic,,,,1513662032452523252,,889,,, +Includes,,,1372844135435303981,737353170652104031,,,,, +HasTopic,,,,1296829658689065159,,83460,,, +HasTopic,,,,747423119260925972,,1333024,,, +HasTopic,,,,1290121451283392110,,204,,, +Author,611117914195523184,,,440265285168056234,,,,, +Author,754480939973310112,,,1441762191425652442,,,,, +Includes,,,1427292001647224242,936722743217343702,,,,, +HasTopic,,,,1441762191425652442,,485537,,, +Sale,1376053313411407054,1347432655942023365,,,,55424107,2/3/2019,, +HasTopic,,,,581543512052485139,,519,,, +Sale,682588400093615551,920136262355651383,,,,,10/5/2018,, +Author,346401281431409585,,,1114502034902546550,,,,, +Sale,317248309514344163,1524681741257900519,,,,,8/3/2018,, +HasTopic,,,,618434247743641149,,3884230,,, +Sale,1243472362254658420,205415260510814362,,,,,8/9/2018,, +Sale,1472154222902711100,529550602103217450,,,,185785,2/17/2019,, +Includes,,,1314315120197156050,803952155714850701,,,,, +Author,1262668194076216011,,,747423119260925972,,,,, +HasTopic,,,,833681012494554358,,787185,,, 
+HasTopic,,,,932362105613871012,,160409,,, +Sale,1125113326787431160,437201545096608055,,,,146,10/14/2018,, +HasTopic,,,,1424660009578332566,,334600,,, +HasTopic,,,451888058015735870,,,121765,,, +HasTopic,,,,1285128710332882742,,193294,,, +HasTopic,,,,956704137555154092,,217627,,, +HasTopic,,,,1424263331858043042,,2454265,,, +HasTopic,,,,1651365355351122204,,783874,,, +HasTopic,,,,1209342585680609487,,792565,,, +Author,1128501731262832684,,,1060309546214304182,,,,, +Author,775818654043059161,,,740410432146852843,,,,, +HasTopic,,,,854149383334143372,,4442,,, +HasOrg,,,,,1004346153600881042,670897,,, +Sale,1508951542204233332,611325512448133762,,,,,10/11/2018,, +HasTopic,,,,747423119260925972,,202013,,, +Author,208411288512434105,,,1513662032452523252,,,,, +HasTopic,,,,740410432146852843,,5088838,,, +Includes,,,15133734353741126,1532662490035322233,,,,, +HasTopic,,,,817526874194673140,,18122778,,, +HasTopic,,,,440265285168056234,,102014,,, +HasTopic,,,,186108460103013588,,732934,,, +Includes,,,1615340315424362057,209800678458482108,,,,, +Author,373641740834326257,,,116892402526543412,,,,, +HasTopic,,,,740410432146852843,,122113,,, +Includes,,,1615340315424362057,440265285168056234,,,,, +Author,719533111062900642,,,,1433303251800176474,,,, +HasTopic,,,,420762134340393550,,10289,,, +HasTopic,,,,1184855350262395542,,11348,,, +HasTopic,,,,82629615412640377,,247154,,, +Author,369370063627142227,,,1184855350262395542,,,,, +Author,1135272113235621141,,,91431002216341149,,,,, +Sale,635555368637193420,134403203055015143,,,,,8/15/2018,, +Includes,,,1314315120197156050,1282227710122181132,,,,, +HasTopic,,,,,1433303251800176474,828749,,, +HasTopic,,,1314315120197156050,,,12796,,, +HasTopic,,,,541215404780905313,,182218,,, +HasTopic,,,,877764733212222524,,7543639,,, +HasTopic,,,,,1433303251800176474,1436668,,, +HasTopic,,,,447169043921403064,,29171,,, +HasOrg,,,,,1433303251800176474,2030894,,, +Sale,477384404927196020,1128501731262832684,,,,271997,09/30/2018,, +HasTopic,,,,581543512052485139,,505619,,, +HasTopic,,,,618434247743641149,,60,,, +HasTopic,,,,803952155714850701,,557887,,, +HasTopic,,,,78678286442461987,,831691,,, +Author,101810442957214781,,,,1004346153600881042,,,, +Sale,1597454052092354280,961135479935321085,,,,1149078,8/1/2018,, +HasOrg,,,,,1004346153600881042,5018694,,, +Sale,301710390995444087,1312322776399358210,,,,379860,10/12/2018,, +HasTopic,,,,,1004346153600881042,29718382,,, +HasTopic,,,,,1433303251800176474,620463,,, +HasTopic,,,,1651365355351122204,,44311,,, +Author,1443919105364146460,,,1424263331858043042,,,,, +Sale,895197896920634500,1508332501512270227,,,,487,7/31/2018,, +HasTopic,,,,321724159614056152,,158668,,, +Includes,,,1427292001647224242,240337224527030225,,,,, +HasTopic,,,,1184855350262395542,,223155,,, +Includes,,,1615340315424362057,1184855350262395542,,,,, +HasTopic,,,,1532662490035322233,,1337691,,, +Author,1352636429150180228,,,186108460103013588,,,,, +HasTopic,,,,393285992310638641,,974850,,, +Includes,,,1615340315424362057,116892402526543412,,,,, +HasTopic,,,,128423416112315798,,361,,, +HasTopic,,,,541215404780905313,,13977,,, +HasTopic,,,,1512214307542520410,,699385,,, +Sale,1035098046740791143,352033450190732475,,,,,8/25/2018,, +Includes,,,1615340315424362057,420762134340393550,,,,, +Author,556320934631523806,,,541215404780905313,,,,, +HasTopic,,,1427292001647224242,,,25395,,, +HasTopic,,,,1424660009578332566,,191290,,, +Sale,351354309273100074,1073324208204442390,,,,206021,8/7/2018,, +Sale,1025135622623992536,1001287904525368324,,,,,10/2/2018,, 
+Author,12321118467056216,,,1512214307542520410,,,,, +HasTopic,,,,932362105613871012,,60,,, +HasTopic,,,,803952155714850701,,5,,, +HasTopic,,,,1220295546212024391,,18031504,,, +HasTopic,,,,91431002216341149,,73843,,, +Author,437573095319558705,,,447169043921403064,,,,, +HasTopic,,,,1424263331858043042,,2566598,,, +Includes,,,15133734353741126,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,466439,,, +Includes,,,1314315120197156050,393285992310638641,,,,, diff --git a/inputs/wmd/data.001.csv b/inputs/wmd/data.001.csv new file mode 100644 index 0000000000..f5479d326c --- /dev/null +++ b/inputs/wmd/data.001.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff7df1aa0a2261d930471fc057251d1aa2cb404fa8c88c12c3b65fe2a5204bf8 +size 2879652 diff --git a/inputs/wmd/dynamic0.txt b/inputs/wmd/dynamic0.txt new file mode 100644 index 0000000000..37648532bc --- /dev/null +++ b/inputs/wmd/dynamic0.txt @@ -0,0 +1,46 @@ +Author,346401281431409585,,,1114502034902546550,,,,, +Sale,317248309514344163,1524681741257900519,,,,,8/3/2018,, +HasTopic,,,,618434247743641149,,3884230,,, +Sale,1243472362254658420,205415260510814362,,,,,8/9/2018,, +Sale,1472154222902711100,529550602103217450,,,,185785,2/17/2019,, +Includes,,,1314315120197156050,803952155714850701,,,,, +Author,1262668194076216011,,,747423119260925972,,,,, +HasTopic,,,,833681012494554358,,787185,,, +HasTopic,,,,932362105613871012,,160409,,, +Sale,1125113326787431160,437201545096608055,,,,146,10/14/2018,, +HasTopic,,,,1424660009578332566,,334600,,, +HasTopic,,,451888058015735870,,,121765,,, +HasTopic,,,,1285128710332882742,,193294,,, +HasTopic,,,,956704137555154092,,217627,,, +HasTopic,,,,1424263331858043042,,2454265,,, +HasTopic,,,,1651365355351122204,,783874,,, +HasTopic,,,,1209342585680609487,,792565,,, +Author,1128501731262832684,,,1060309546214304182,,,,, +Author,775818654043059161,,,740410432146852843,,,,, +HasTopic,,,,854149383334143372,,4442,,, +HasOrg,,,,,1004346153600881042,670897,,, +Sale,1508951542204233332,611325512448133762,,,,,10/11/2018,, +HasTopic,,,,747423119260925972,,202013,,, +Author,208411288512434105,,,1513662032452523252,,,,, +HasTopic,,,,740410432146852843,,5088838,,, +Includes,,,15133734353741126,1532662490035322233,,,,, +HasTopic,,,,817526874194673140,,18122778,,, +HasTopic,,,,440265285168056234,,102014,,, +HasTopic,,,,186108460103013588,,732934,,, +Includes,,,1615340315424362057,209800678458482108,,,,, +Author,373641740834326257,,,116892402526543412,,,,, +HasTopic,,,,740410432146852843,,122113,,, +Includes,,,1615340315424362057,440265285168056234,,,,, +Author,719533111062900642,,,,1433303251800176474,,,, +HasTopic,,,,420762134340393550,,10289,,, +HasTopic,,,,1184855350262395542,,11348,,, +HasTopic,,,,82629615412640377,,247154,,, +Author,369370063627142227,,,1184855350262395542,,,,, +Author,1135272113235621141,,,91431002216341149,,,,, +Sale,635555368637193420,134403203055015143,,,,,8/15/2018,, +Includes,,,1314315120197156050,1282227710122181132,,,,, +HasTopic,,,,,1433303251800176474,828749,,, +HasTopic,,,1314315120197156050,,,12796,,, +HasTopic,,,,541215404780905313,,182218,,, +HasTopic,,,,877764733212222524,,7543639,,, +HasTopic,,,,,1433303251800176474,1436668,,, diff --git a/inputs/wmd/dynamic1.txt b/inputs/wmd/dynamic1.txt new file mode 100644 index 0000000000..8f13aa07e8 --- /dev/null +++ b/inputs/wmd/dynamic1.txt @@ -0,0 +1,44 @@ +HasTopic,,,,447169043921403064,,29171,,, +HasOrg,,,,,1433303251800176474,2030894,,, +Sale,477384404927196020,1128501731262832684,,,,271997,09/30/2018,, 
+HasTopic,,,,581543512052485139,,505619,,, +HasTopic,,,,618434247743641149,,60,,, +HasTopic,,,,803952155714850701,,557887,,, +HasTopic,,,,78678286442461987,,831691,,, +Author,101810442957214781,,,,1004346153600881042,,,, +Sale,1597454052092354280,961135479935321085,,,,1149078,8/1/2018,, +HasOrg,,,,,1004346153600881042,5018694,,, +Sale,301710390995444087,1312322776399358210,,,,379860,10/12/2018,, +HasTopic,,,,,1004346153600881042,29718382,,, +HasTopic,,,,,1433303251800176474,620463,,, +HasTopic,,,,1651365355351122204,,44311,,, +Author,1443919105364146460,,,1424263331858043042,,,,, +Sale,895197896920634500,1508332501512270227,,,,487,7/31/2018,, +HasTopic,,,,321724159614056152,,158668,,, +Includes,,,1427292001647224242,240337224527030225,,,,, +HasTopic,,,,1184855350262395542,,223155,,, +Includes,,,1615340315424362057,1184855350262395542,,,,, +HasTopic,,,,1532662490035322233,,1337691,,, +Author,1352636429150180228,,,186108460103013588,,,,, +HasTopic,,,,393285992310638641,,974850,,, +Includes,,,1615340315424362057,116892402526543412,,,,, +HasTopic,,,,128423416112315798,,361,,, +HasTopic,,,,541215404780905313,,13977,,, +HasTopic,,,,1512214307542520410,,699385,,, +Sale,1035098046740791143,352033450190732475,,,,,8/25/2018,, +Includes,,,1615340315424362057,420762134340393550,,,,, +Author,556320934631523806,,,541215404780905313,,,,, +HasTopic,,,1427292001647224242,,,25395,,, +HasTopic,,,,1424660009578332566,,191290,,, +Sale,351354309273100074,1073324208204442390,,,,206021,8/7/2018,, +Sale,1025135622623992536,1001287904525368324,,,,,10/2/2018,, +Author,12321118467056216,,,1512214307542520410,,,,, +HasTopic,,,,932362105613871012,,60,,, +HasTopic,,,,803952155714850701,,5,,, +HasTopic,,,,1220295546212024391,,18031504,,, +HasTopic,,,,91431002216341149,,73843,,, +Author,437573095319558705,,,447169043921403064,,,,, +HasTopic,,,,1424263331858043042,,2566598,,, +Includes,,,15133734353741126,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,466439,,, +Includes,,,1314315120197156050,393285992310638641,,,,, diff --git a/inputs/wmd/static b/inputs/wmd/static new file mode 100644 index 0000000000..b95a23c9b4 --- /dev/null +++ b/inputs/wmd/static @@ -0,0 +1,650 @@ +#delimieter: , +#columns:type,person1,person2,forum,forum_event,publication,topic,date,lat,lon +#types:STRING,UINT,UINT,UINT,UINT,UINT,UINT,USDATE,DOUBLE,DOUBLE +Publication,,,,,102583151124020340,,4/1/2013,, +Publication,,,,,1004346153600881042,,12/2/2014,, +Publication,,,,,1433303251800176474,,1/1/2014,, +Publication,,,,,963345652072941810,,3/1/2017,, +ForumEvent,,,1372844135435303981,1651365355351122204,,,1/7/2019,, +ForumEvent,,,1372844135435303981,1060309546214304182,,,1/3/2018,, +ForumEvent,,,1372844135435303981,932362105613871012,,,1/8/2018,, +ForumEvent,,,1372844135435303981,618434247743641149,,,1/8/2018,, +ForumEvent,,,1372844135435303981,1209342585680609487,,,1/10/2018,, +ForumEvent,,,1615340315424362057,1245126351375505703,,,2/13/2018,, +ForumEvent,,,1372844135435303981,581543512052485139,,,2/5/2018,, +ForumEvent,,,1314315120197156050,833681012494554358,,,3/23/2018,, +ForumEvent,,,1615340315424362057,1220295546212024391,,,3/26/2018,, +ForumEvent,,,1372844135435303981,1424263331858043042,,,4/5/2018,, +ForumEvent,,,1615340315424362057,1290121451283392110,,,4/12/2018,, +ForumEvent,,,1427292001647224242,240337224527030225,,,4/24/2018,, +ForumEvent,,,1615340315424362057,440265285168056234,,,5/17/2018,, +ForumEvent,,,1615340315424362057,817526874194673140,,,5/31/2018,, +ForumEvent,,,1314315120197156050,846536331643665114,,,6/12/2018,, 
+ForumEvent,,,1202482536733844323,1114502034902546550,,,6/14/2018,, +ForumEvent,,,1372844135435303981,1441762191425652442,,,7/8/2018,, +ForumEvent,,,1615340315424362057,128423416112315798,,,7/20/2018,, +ForumEvent,,,1615340315424362057,701755398615636460,,,8/1/2018,, +ForumEvent,,,1314315120197156050,393285992310638641,,,8/12/2018,, +ForumEvent,,,1615340315424362057,420762134340393550,,,9/9/2018,, +ForumEvent,,,1372844135435303981,737353170652104031,,,9/14/2018,, +ForumEvent,,,1615340315424362057,116892402526543412,,,10/13/2018,, +ForumEvent,,,1372844135435303981,1028329324575034354,,,10/20/2018,, +ForumEvent,,,1202482536733844323,1513662032452523252,,,10/30/2018,, +ForumEvent,,,1314315120197156050,803952155714850701,,,11/14/2018,, +ForumEvent,,,1372844135435303981,186108460103013588,,,11/12/2018,, +ForumEvent,,,1615340315424362057,1184855350262395542,,,12/1/2018,, +ForumEvent,,,1372844135435303981,1302313601603127196,,,12/16/2018,, +ForumEvent,,,1615340315424362057,78678286442461987,,,1/11/2019,, +ForumEvent,,,15133734353741126,1285128710332882742,,,1/10/2019,, +ForumEvent,,,1615340315424362057,447169043921403064,,,2/2/2019,, +ForumEvent,,,1372844135435303981,91431002216341149,,,2/13/2019,, +ForumEvent,,,1202482536733844323,1296829658689065159,,,2/13/2019,, +ForumEvent,,,1615340315424362057,877764733212222524,,,3/28/2019,, +ForumEvent,,,1314315120197156050,1614534111336540475,,,3/3/2019,, +ForumEvent,,,1615340315424362057,209800678458482108,,,4/14/2019,, +ForumEvent,,,15133734353741126,1532662490035322233,,,4/1/2019,, +ForumEvent,,,1314315120197156050,321724159614056152,,,5/29/2019,, +ForumEvent,,,1372844135435303981,1512214307542520410,,,5/17/2019,, +ForumEvent,,,1615340315424362057,740410432146852843,,,6/5/2019,, +ForumEvent,,,1372844135435303981,82629615412640377,,,6/24/2019,, +ForumEvent,,,1427292001647224242,936722743217343702,,,6/30/2019,, +ForumEvent,,,1372844135435303981,747423119260925972,,,7/11/2019,, +ForumEvent,,,451888058015735870,541215404780905313,,,7/3/2019,, +ForumEvent,,,1615340315424362057,1424660009578332566,,,8/25/2019,, +ForumEvent,,,1314315120197156050,1282227710122181132,,,8/5/2019,, +ForumEvent,,,1314315120197156050,854149383334143372,,,9/19/2019,, +ForumEvent,,,1615340315424362057,202421472143651025,,,9/21/2019,, +ForumEvent,,,353365307219544531,956704137555154092,,,10/17/2019,, +ForumEvent,,,,1142353335442842612,,,10/2/2019,, +ForumEvent,,,,1417645062678302203,,,10/27/2019,, +ForumEvent,,,,691612430615344311,,,11/18/2019,, +ForumEvent,,,,499518911125406276,,,11/7/2019,, +ForumEvent,,,,802203574353867462,,,12/26/2019,, +ForumEvent,,,,1154045191214226005,,,12/19/2019,, +Forum,,,227560344059645632,,,,,, +Forum,,,642724485236726353,,,,,, +Forum,,,1583773067440233990,,,,,, +Forum,,,353365307219544531,,,,,, +Forum,,,1372844135435303981,,,,,, +Forum,,,817570614729612563,,,,,, +Forum,,,1222966301068614432,,,,,, +Forum,,,254347350613723281,,,,,, +Forum,,,230406515001545612,,,,,, +Forum,,,1561731546512891652,,,,,, +Forum,,,188043543797416114,,,,,, +Forum,,,1083041743586306041,,,,,, +Forum,,,132472381132383125,,,,,, +Forum,,,20118285562646166,,,,,, +Forum,,,555784630220125214,,,,,, +Forum,,,1015255971523263924,,,,,, +Forum,,,1342495276080758813,,,,,, +Forum,,,851350143155248158,,,,,, +Forum,,,1427292001647224242,,,,,, +Forum,,,722051276937327353,,,,,, +Forum,,,1107212912316309796,,,,,, +Forum,,,504490409499070811,,,,,, +Forum,,,15133734353741126,,,,,, +Forum,,,869745302967338810,,,,,, +Forum,,,324124332757504717,,,,,, +Forum,,,852491638004013222,,,,,, 
+Forum,,,1040437236245414809,,,,,, +Forum,,,442231451428861295,,,,,, +Forum,,,101022092642335391,,,,,, +Forum,,,1037815940207624157,,,,,, +Forum,,,1331941318481662527,,,,,, +Forum,,,1615340315424362057,,,,,, +Forum,,,1425519641234605945,,,,,, +Forum,,,705065952261175094,,,,,, +Forum,,,1314315120197156050,,,,,, +Forum,,,214214821270800149,,,,,, +Forum,,,1361197157264541395,,,,,, +Forum,,,1033538541314217453,,,,,, +Forum,,,565733832133342431,,,,,, +Forum,,,451888058015735870,,,,,, +Forum,,,155345234637251110,,,,,, +Forum,,,1371100161965701220,,,,,, +Forum,,,1307221369082243900,,,,,, +Forum,,,406508153569651122,,,,,, +Forum,,,1202482536733844323,,,,,, +Forum,,,912373284682369433,,,,,, +Person,477384404927196020,,,,,,,, +Person,182010581109145287,,,,,,,, +Topic,,,,,,271997,,, +Topic,,,,,,127197,,, +Person,284405379592161575,,,,,,,, +Topic,,,,,,11650,,, +Topic,,,,,,185785,,, +Topic,,,,,,1907525,,, +Topic,,,,,,1333024,,, +Topic,,,,,,2329,,, +Topic,,,,,,571,,, +Topic,,,,,,56683126,,, +Topic,,,,,,146,,, +Topic,,,,,,487,,, +Topic,,,,,,193294,,, +Topic,,,,,,177,,, +Topic,,,,,,81944,,, +Topic,,,,,,998,,, +Topic,,,,,,55424107,,, +Topic,,,,,,41323,,, +Topic,,,,,,38695,,, +Topic,,,,,,379860,,, +Topic,,,,,,1149078,,, +Topic,,,,,,172809,,, +Topic,,,,,,1642639,,, +Topic,,,,,,903552,,, +Topic,,,,,,204,,, +Topic,,,,,,7817,,, +Topic,,,,,,201816,,, +Topic,,,,,,785,,49.19,-2.11 +Topic,,,,,,127,,, +Topic,,,,,,206021,,, +Topic,,,,,,181508,,, +Topic,,,,,,735,,, +Topic,,,,,,304878,,, +Topic,,,,,,7590,,, +Topic,,,,,,8074,,, +Topic,,,,,,24862,,, +Topic,,,,,,35127,,, +Topic,,,,,,60,,40.67,-73.94 +Topic,,,,,,443533,,, +Person,1160244137181801222,,,,,,,, +Topic,,,,,,192242,,, +Topic,,,,,,11707,,, +Topic,,,,,,73843,,, +Topic,,,,,,505619,,, +Topic,,,,,,158668,,, +Topic,,,,,,889,,34.0,66.0 +Person,895197896920634500,,,,,,,, +Topic,,,,,,18426,,40.84676,-73.873207 +Topic,,,,,,787185,,, +Topic,,,,,,467,,, +Person,1419850416906085161,,,,,,,, +Topic,,,,,,2869238,,, +Topic,,,,,,5,,, +Topic,,,,,,334600,,, +Topic,,,,,,191290,,, +Topic,,,,,,122113,,, +Topic,,,,,,179057,,, +Topic,,,,,,11635,,, +Topic,,,,,,329717,,, +Person,33927662206515912,,,,,,,, +Topic,,,,,,35140,,, +Topic,,,,,,485537,,, +Topic,,,,,,102014,,, +Topic,,,,,,40357,,, +Topic,,,,,,1337691,,, +Topic,,,,,,160409,,40.7825,-73.966111111111 +Topic,,,,,,69871376,,, +Topic,,,,,,177749,,, +Topic,,,,,,11348,,, +Topic,,,,,,182218,,, +Topic,,,,,,1229,,47.568611111111,40.852783333333 +Topic,,,,,,5322,,, +Person,1035098046740791143,,,,,,,, +Topic,,,,,,792565,,48.10277778,20.78388889 +Topic,,,,,,37654,,, +Topic,,,,,,25395,,40.735277777778,-74.185 +Topic,,,,,,169313,,, +Topic,,,,,,728,,, +Topic,,,,,,699385,,, +Topic,,,,,,22983,,, +Person,971383124880710240,,,,,,,, +Person,1010629269012322480,,,,,,,, +Topic,,,,,,11299,,40.728333333333,-73.994166666667 +Topic,,,,,,83460,,, +Topic,,,,,,10289,,, +Topic,,,,,,11019,,, +Topic,,,,,,470118,,, +Person,1426050562563532645,,,,,,,, +Person,75415528634186650,,,,,,,, +Person,1001287904525368324,,,,,,,, +Person,242111862342742257,,,,,,,, +Topic,,,,,,11249,,40.747,-73.986 +Topic,,,,,,3933135,,, +Topic,,,,,,44311,,, +Person,1025135622623992536,,,,,,,, +Topic,,,,,,37497186,,, +Person,584485814982143221,,,,,,,, +Person,1508332501512270227,,,,,,,, +Topic,,,,,,328473,,40.712,-74.002 +Topic,,,,,,25347,,, +Topic,,,,,,175111,,, +Person,1312322776399358210,,,,,,,, +Topic,,,,,,16003594,,, +Topic,,,,,,48789658,,, +Topic,,,,,,8148,,, +Topic,,,,,,9420,,, +Topic,,,,,,771572,,40.699945,-73.950148 +Topic,,,,,,5088838,,, +Person,1597454052092354280,,,,,,,, 
+Person,961135479935321085,,,,,,,, +Topic,,,,,,123705,,, +Topic,,,,,,598435,,, +Topic,,,,,,732934,,, +Person,317248309514344163,,,,,,,, +Person,1524681741257900519,,,,,,,, +Topic,,,,,,254860,,, +Topic,,,,,,335046,,, +Person,534449219561977424,,,,,,,, +Person,1035056342462002945,,,,,,,, +Person,1222330726897222256,,,,,,,, +Person,493345739124130581,,,,,,,, +Topic,,,,,,831691,,, +Topic,,,,,,28321638,,, +Person,682588400093615551,,,,,,,, +Person,920136262355651383,,,,,,,, +Person,351354309273100074,,,,,,,, +Person,495352903902152146,,,,,,,, +Person,211778681592778731,,,,,,,, +Topic,,,,,,1189753,,, +Person,396953035572582107,,,,,,,, +Topic,,,,,,828749,,, +Topic,,,,,,904756,,, +Topic,,,,,,7392008,,, +Topic,,,,,,2566598,,, +Person,363047312690634767,,,,,,,, +Topic,,,,,,618102,,, +Person,205415260510814362,,,,,,,, +Person,1251650482793161774,,,,,,,, +Topic,,,,,,620463,,, +Person,1463522545161373807,,,,,,,, +Person,1150357430325141247,,,,,,,, +Person,674253449444876344,,,,,,,, +Person,1073324208204442390,,,,,,,, +Topic,,,,,,1049632,,40.665352,-73.969264 +Topic,,,,,,29171,,, +Person,1637740339335566412,,,,,,,, +Person,524508243055647325,,,,,,,, +Person,320151361710953715,,,,,,,, +Person,128643504412157535,,,,,,,, +Topic,,,,,,361,,, +Person,1243472362254658420,,,,,,,, +Topic,,,,,,617927,,, +Person,1275555184736572954,,,,,,,, +Topic,,,,,,974850,,, +Person,934144115142885657,,,,,,,, +Topic,,,,,,217627,,, +Topic,,,,,,223155,,, +Person,1504217244688272832,,,,,,,, +Person,144548678565311334,,,,,,,, +Person,1400516284533535554,,,,,,,, +Topic,,,,,,14528,,, +Person,1508951542204233332,,,,,,,, +Person,611325512448133762,,,,,,,, +Person,635555368637193420,,,,,,,, +Person,134403203055015143,,,,,,,, +Topic,,,,,,202013,,, +Topic,,,,,,7602643,,, +Topic,,,,,,121765,,40.774444444444,-73.904166666667 +Person,765254641650259739,,,,,,,, +Person,975526659664533195,,,,,,,, +Person,273872236541568195,,,,,,,, +Topic,,,,,,557887,,, +Topic,,,,,,774228,,, +Topic,,,,,,46744,,, +Person,352033450190732475,,,,,,,, +Person,841466124620556016,,,,,,,, +Person,1517466541524095404,,,,,,,, +Topic,,,,,,114633,,, +Person,301710390995444087,,,,,,,, +Topic,,,,,,16868955,,, +Person,747231730275042400,,,,,,,, +Person,1164902255571715230,,,,,,,, +Person,291914370254601234,,,,,,,, +Topic,,,,,,247154,,, +Topic,,,,,,519,,, +Topic,,,,,,3303945,,, +Person,1022241560051472272,,,,,,,, +Person,566448585007839403,,,,,,,, +Topic,,,,,,11229,,, +Person,735713441679521195,,,,,,,, +Person,1128501731262832684,,,,,,,, +Person,446962590481145702,,,,,,,, +Person,1125113326787431160,,,,,,,, +Person,437201545096608055,,,,,,,, +Person,940377106445268064,,,,,,,, +Person,1647329525841402942,,,,,,,, +Topic,,,,,,3884230,,, +Topic,,,,,,131191,,, +Person,1376053313411407054,,,,,,,, +Person,1347432655942023365,,,,,,,, +Person,1472154222902711100,,,,,,,, +Person,529550602103217450,,,,,,,, +Topic,,,,,,43035,,, +Topic,,,,,,126095,,, +Topic,,,,,,49088,,40.8075,-73.961944444444 +Person,910075513854877065,,,,,,,, +Topic,,,,,,5018694,,40.859105555556,-74.198686111111 +Topic,,,,,,2446683,,43.1189,20.0797 +Topic,,,,,,2030894,,40.850852,-73.844949 +Topic,,,,,,29718382,,, +Topic,,,,,,130965,,40.860833333333,-73.884444444444 +Topic,,,,,,167172,,, +Topic,,,,,,2456507,,, +Person,842652402732741813,,,,,,,, +Topic,,,,,,670897,,33.421111111111,-111.93166666667 +Topic,,,,,,1436668,,, +Topic,,,,,,753651,,, +Topic,,,,,,7451247,,, +Topic,,,,,,2493,,, +Person,719533111062900642,,,,,,,, +Person,834321901190546647,,,,,,,, +Topic,,,,,,12796,,, +Person,937074421253040138,,,,,,,, 
+Topic,,,,,,18159587,,, +Person,101810442957214781,,,,,,,, +Topic,,,,,,131401,,, +Topic,,,,,,929920,,, +Topic,,,,,,466439,,, +Topic,,,,,,6498684,,, +Topic,,,,,,206361,,, +Topic,,,,,,41796,,, +Person,1152266442105786574,,,,,,,, +Person,95240187156237415,,,,,,,, +Topic,,,,,,7897553,,, +Topic,,,,,,206887,,, +Topic,,,,,,5405633,,, +Person,1031526243841315760,,,,,,,, +Topic,,,,,,11348540,,, +Topic,,,,,,4198163,,, +Topic,,,,,,16048728,,, +Topic,,,,,,189756,,, +Topic,,,,,,643638,,, +Topic,,,,,,783874,,, +Topic,,,,,,492346,,37.2708,-76.7069 +Topic,,,,,,29042975,,, +Topic,,,,,,12103677,,, +Topic,,,,,,2329157,,, +Person,1563598527979706128,,,,,,,, +Topic,,,,,,4442,,, +Person,264075025125849069,,,,,,,, +Person,369370063627142227,,,,,,,, +Person,1300183120520109060,,,,,,,, +Topic,,,,,,18031504,,, +Topic,,,,,,4229887,,, +Person,611117914195523184,,,,,,,, +Topic,,,,,,7543639,,, +Topic,,,,,,13977,,, +Topic,,,,,,18122778,,, +Person,166319955306346577,,,,,,,, +Topic,,,,,,588894,,, +Topic,,,,,,2454265,,, +Person,1547400408884914628,,,,,,,, +Person,373641740834326257,,,,,,,, +Topic,,,,,,5264957,,, +Topic,,,,,,968598,,, +Person,754480939973310112,,,,,,,, +Topic,,,,,,1808877,,, +Person,1443919105364146460,,,,,,,, +Person,735243266472522113,,,,,,,, +Person,1321304826561136177,,,,,,,, +Person,1560601202484151215,,,,,,,, +Person,1403521534163206962,,,,,,,, +Person,231472126788137195,,,,,,,, +Person,208411288512434105,,,,,,,, +Topic,,,,,,7252790,,, +Person,1211456636406749825,,,,,,,, +Person,1071303249530347453,,,,,,,, +Person,1069710216181783510,,,,,,,, +Person,1578613817419480731,,,,,,,, +Person,944546653739552042,,,,,,,, +Topic,,,,,,8856932,,, +Person,616673625330310949,,,,,,,, +Person,1302421465423646583,,,,,,,, +Person,720320812100121121,,,,,,,, +Person,653345304799504620,,,,,,,, +Person,346401281431409585,,,,,,,, +Person,1526112405471861415,,,,,,,, +Person,1501623481588541372,,,,,,,, +Person,312380611598980641,,,,,,,, +Person,1115244423173415593,,,,,,,, +Person,1555348115336584230,,,,,,,, +Person,12321118467056216,,,,,,,, +Person,1352636429150180228,,,,,,,, +Person,725324491051434870,,,,,,,, +Person,846764541256336994,,,,,,,, +Person,140443713446471314,,,,,,,, +Person,1135272113235621141,,,,,,,, +Person,775818654043059161,,,,,,,, +Person,529476525413023401,,,,,,,, +Person,1262668194076216011,,,,,,,, +Person,119474435514352445,,,,,,,, +Person,437573095319558705,,,,,,,, +Person,1035555223142154728,,,,,,,, +Person,556320934631523806,,,,,,,, +Person,1356253242219285320,,,,,,,, +Person,248654236829951090,,,,,,,, +Person,481153633235353485,,,,,,,, +Includes,,,1202482536733844323,1296829658689065159,,,,, +HasTopic,,,,956704137555154092,,335046,,, +HasTopic,,,,1028329324575034354,,1808877,,, +HasTopic,,,,,1004346153600881042,735,,, +Author,1560601202484151215,,,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,131401,,, +HasTopic,,,,1302313601603127196,,48789658,,, +HasTopic,,,,1114502034902546550,,40357,,, +Sale,1275555184736572954,1463522545161373807,,,,,8/16/2018,, +HasTopic,,,,78678286442461987,,28321638,,, +HasTopic,,,,854149383334143372,,903552,,, +HasTopic,,,,240337224527030225,,519,,, +HasTopic,,,,116892402526543412,,5264957,,, +HasTopic,,,,202421472143651025,,12103677,,, +HasTopic,,,,393285992310638641,,470118,,, +Author,910075513854877065,,,,102583151124020340,,,, +Sale,1426050562563532645,75415528634186650,,,,,7/29/2018,, +Author,1128501731262832684,,,1513662032452523252,,,,, +Sale,971383124880710240,1010629269012322480,,,,38695,7/28/2018,, +Author,477384404927196020,,,1651365355351122204,,,,, 
+Author,725324491051434870,,,202421472143651025,,,,, +HasTopic,,,,932362105613871012,,69871376,,, +HasTopic,,,,1290121451283392110,,169313,,, +Sale,396953035572582107,1400516284533535554,,,,41323,8/21/2018,, +HasTopic,,,,701755398615636460,,8148,,, +Sale,940377106445268064,1647329525841402942,,,,25347,10/15/2018,, +HasTopic,,,,1651365355351122204,,643638,,, +HasTopic,,,,1114502034902546550,,46744,,, +HasTopic,,,,936722743217343702,,123705,,, +HasTopic,,,,321724159614056152,,6498684,,, +Sale,1419850416906085161,1128501731262832684,,,,2869238,09/28/2018,, +HasTopic,,,1372844135435303981,,,60,,, +Author,1578613817419480731,,,321724159614056152,,,,, +HasTopic,,,,1512214307542520410,,8074,,, +HasTopic,,,,618434247743641149,,192242,,, +Includes,,,1202482536733844323,1114502034902546550,,,,, +Includes,,,1615340315424362057,877764733212222524,,,,, +HasTopic,,,,240337224527030225,,1229,,, +HasTopic,,,,1209342585680609487,,179057,,, +Author,834321901190546647,,,,1433303251800176474,,,, +HasTopic,,,,1209342585680609487,,175111,,, +Includes,,,1372844135435303981,581543512052485139,,,,, +Author,720320812100121121,,,420762134340393550,,,,, +Includes,,,1615340315424362057,1424660009578332566,,,,, +HasTopic,,,,932362105613871012,,771572,,, +Includes,,,353365307219544531,956704137555154092,,,,, +HasTopic,,,,956704137555154092,,929920,,, +HasTopic,,,,1441762191425652442,,177749,,, +Author,1563598527979706128,,,1220295546212024391,,,,, +Author,248654236829951090,,,1424660009578332566,,,,, +Author,529476525413023401,,,1302313601603127196,,,,, +HasTopic,,,,1209342585680609487,,771572,,, +HasTopic,,,,,1433303251800176474,83460,,, +Sale,1022241560051472272,1637740339335566412,,,,1642639,10/13/2018,, +Includes,,,1615340315424362057,128423416112315798,,,,, +Includes,,,1372844135435303981,932362105613871012,,,,, +Includes,,,1372844135435303981,747423119260925972,,,,, +HasTopic,,,,854149383334143372,,24862,,, +Sale,1160244137181801222,1035056342462002945,,,,181508,10/4/2018,, +HasTopic,,,,1114502034902546550,,328473,,, +Includes,,,1314315120197156050,854149383334143372,,,,, +Sale,1472154222902711100,1128501731262832684,,,,185785,09/28/2018,, +HasOrg,,,,,102583151124020340,49088,,, +HasTopic,,,,932362105613871012,,11299,,, +HasTopic,,,,932362105613871012,,18426,,, +HasTopic,,,,,1004346153600881042,7392008,,, +Author,1526112405471861415,,,846536331643665114,,,,, +HasTopic,,,,78678286442461987,,617927,,, +HasTopic,,,,321724159614056152,,4229887,,, +HasTopic,,,,82629615412640377,,5322,,, +Author,1128501731262832684,,,1114502034902546550,,,,, +Author,166319955306346577,,,209800678458482108,,,,, +Sale,735713441679521195,1128501731262832684,,,,11650,10/10/2018,, +Author,944546653739552042,,,803952155714850701,,,,, +HasTopic,,,,581543512052485139,,3933135,,, +HasTopic,,,,209800678458482108,,4198163,,, +HasTopic,,,,932362105613871012,,7590,,, +Includes,,,1372844135435303981,1060309546214304182,,,,, +HasTopic,,,,846536331643665114,,167172,,, +HasTopic,,,,740410432146852843,,11348540,,, +Author,937074421253040138,,,,1004346153600881042,,,, +HasTopic,,,,1282227710122181132,,2493,,, +HasTopic,,,,,1433303251800176474,43035,,, +Sale,495352903902152146,211778681592778731,,,,81944,8/6/2018,, +Includes,,,1372844135435303981,1512214307542520410,,,,, +HasTopic,,,353365307219544531,,,2329,,, +Includes,,,1615340315424362057,701755398615636460,,,,, +Includes,,,1372844135435303981,1209342585680609487,,,,, +Includes,,,1202482536733844323,1513662032452523252,,,,, +HasTopic,,,,701755398615636460,,968598,,, 
+HasTopic,,,,128423416112315798,,11019,,, +Author,1356253242219285320,,,854149383334143372,,,,, +Author,1031526243841315760,,,,1433303251800176474,,,, +Author,842652402732741813,,,,1433303251800176474,,,, +HasTopic,,,,440265285168056234,,18159587,,, +Author,1501623481588541372,,,833681012494554358,,,,, +HasTopic,,,,1028329324575034354,,11299,,, +Author,140443713446471314,,,1028329324575034354,,,,, +Author,264075025125849069,,,1245126351375505703,,,,, +Includes,,,1314315120197156050,1614534111336540475,,,,, +HasTopic,,,,,1433303251800176474,131191,,, +HasTopic,,,,420762134340393550,,1907525,,, +Sale,975526659664533195,524508243055647325,,,,,10/15/2018,, +Author,284405379592161575,,,,102583151124020340,,,, +HasTopic,,,,,963345652072941810,735,,, +HasTopic,,,,,1433303251800176474,3303945,,, +Author,1152266442105786574,,,,963345652072941810,,,, +HasTopic,,,,202421472143651025,,1189753,,, +HasTopic,,,,1028329324575034354,,11229,,, +Includes,,,1615340315424362057,1220295546212024391,,,,, +Includes,,,1314315120197156050,846536331643665114,,,,, +HasTopic,,,,1296829658689065159,,7451247,,, +Includes,,,1615340315424362057,78678286442461987,,,,, +HasTopic,,,,541215404780905313,,11635,,, +Author,481153633235353485,,,956704137555154092,,,,, +Author,611117914195523184,,,701755398615636460,,,,, +Author,95240187156237415,,,,963345652072941810,,,, +Includes,,,1372844135435303981,91431002216341149,,,,, +Author,846764541256336994,,,1290121451283392110,,,,, +HasTopic,,,,393285992310638641,,206887,,, +HasTopic,,,,1512214307542520410,,177749,,, +HasTopic,,,,1114502034902546550,,7252790,,, +HasTopic,,,1202482536733844323,,,60,,, +HasTopic,,,,1302313601603127196,,41796,,, +HasTopic,,,,1114502034902546550,,44311,,, +Author,1547400408884914628,,,393285992310638641,,,,, +Author,312380611598980641,,,78678286442461987,,,,, +Sale,273872236541568195,1251650482793161774,,,,172809,8/22/2018,, +HasTopic,,,,1513662032452523252,,728,,, +Includes,,,1615340315424362057,447169043921403064,,,,, +Includes,,,1372844135435303981,82629615412640377,,,,, +HasTopic,,,,1302313601603127196,,29042975,,, +Author,1321304826561136177,,,1532662490035322233,,,,, +Includes,,,1372844135435303981,1302313601603127196,,,,, +Sale,363047312690634767,242111862342742257,,,,,10/4/2018,, +Author,1071303249530347453,,,737353170652104031,,,,, +HasTopic,,,,1282227710122181132,,35140,,, +HasTopic,,,,91431002216341149,,46744,,, +Includes,,,1372844135435303981,1441762191425652442,,,,, +Sale,446962590481145702,534449219561977424,,,,,10/11/2018,, +Author,1035555223142154728,,,877764733212222524,,,,, +Author,1403521534163206962,,,932362105613871012,,,,, +HasTopic,,,,701755398615636460,,35127,,, +Includes,,,1372844135435303981,1028329324575034354,,,,, +HasTopic,,,,1245126351375505703,,254860,,, +HasTopic,,,,209800678458482108,,7897553,,, +Includes,,,1372844135435303981,186108460103013588,,,,, +HasOrg,,,,,102583151124020340,304878,,, +HasTopic,,,,,1433303251800176474,998,,, +Author,653345304799504620,,,581543512052485139,,,,, +Author,1302421465423646583,,,240337224527030225,,,,, +Author,1211456636406749825,,,618434247743641149,,,,, +HasTopic,,,,240337224527030225,,785,,, +HasTopic,,,,,1004346153600881042,83460,,, +Includes,,,1615340315424362057,1245126351375505703,,,,, +HasTopic,,,,747423119260925972,,16868955,,, +HasTopic,,,,91431002216341149,,9420,,, +Includes,,,451888058015735870,541215404780905313,,,,, +HasTopic,,,,1209342585680609487,,492346,,, +HasTopic,,,,1290121451283392110,,114633,,, +HasTopic,,,,1441762191425652442,,11249,,, 
+HasTopic,,,,1532662490035322233,,753651,,, +HasTopic,,,,1532662490035322233,,11707,,, +HasTopic,,,,1245126351375505703,,904756,,, +HasTopic,,,15133734353741126,,,189756,,, +Sale,1222330726897222256,493345739124130581,,,,177,8/4/2018,, +Includes,,,1372844135435303981,1424263331858043042,,,,, +Includes,,,1615340315424362057,1290121451283392110,,,,, +Sale,477384404927196020,182010581109145287,,,,271997,2/17/2019,, +HasTopic,,,,737353170652104031,,16003594,,, +HasTopic,,,,747423119260925972,,2329157,,, +Sale,566448585007839403,765254641650259739,,,,,10/9/2018,, +HasOrg,,,,,963345652072941810,130965,,, +Author,616673625330310949,,,936722743217343702,,,,, +HasTopic,,,,1513662032452523252,,44311,,, +Author,119474435514352445,,,1282227710122181132,,,,, +HasTopic,,,,82629615412640377,,22983,,, +HasTopic,,,,833681012494554358,,201816,,, +HasTopic,,,,,1433303251800176474,14528,,, +Sale,33927662206515912,934144115142885657,,,,,10/10/2018,, +Includes,,,1372844135435303981,618434247743641149,,,,, +Includes,,,1615340315424362057,740410432146852843,,,,, +Author,1300183120520109060,,,128423416112315798,,,,, +Author,1069710216181783510,,,1296829658689065159,,,,, +Sale,1504217244688272832,144548678565311334,,,,56683126,8/13/2018,, +Author,1115244423173415593,,,1614534111336540475,,,,, +Sale,747231730275042400,584485814982143221,,,,127,10/7/2018,, +HasTopic,,,,1285128710332882742,,37654,,, +HasTopic,,,1615340315424362057,,,12796,,, +Author,1555348115336584230,,,817526874194673140,,,,, +HasTopic,,,,,102583151124020340,43035,,, +HasTopic,,,,737353170652104031,,206361,,, +Includes,,,1314315120197156050,833681012494554358,,,,, +Sale,128643504412157535,320151361710953715,,,,443533,8/24/2018,, +HasTopic,,,,1220295546212024391,,588894,,, +HasTopic,,,,202421472143651025,,7602643,,, +Sale,1150357430325141247,674253449444876344,,,,,10/7/2018,, +Author,735243266472522113,,,1209342585680609487,,,,, +Includes,,,1372844135435303981,1651365355351122204,,,,, +HasTopic,,,,1424660009578332566,,618102,,, +HasTopic,,,,,102583151124020340,126095,,, +HasTopic,,,,,1004346153600881042,2446683,,, +HasTopic,,,,,1433303251800176474,598435,,, +Author,735713441679521195,,,,1433303251800176474,,,, +Includes,,,1314315120197156050,321724159614056152,,,,, +HasTopic,,,,1114502034902546550,,8856932,,, +HasTopic,,,,,1433303251800176474,5405633,,, +HasTopic,,,,1285128710332882742,,11299,,, +HasTopic,,,,420762134340393550,,12796,,, +HasTopic,,,,1184855350262395542,,329717,,, +HasTopic,,,,116892402526543412,,2456507,,, +Includes,,,1615340315424362057,817526874194673140,,,,, +Sale,841466124620556016,1517466541524095404,,,,,8/27/2018,, +HasTopic,,,,1209342585680609487,,127197,,, +HasTopic,,,,420762134340393550,,16048728,,, +HasTopic,,,,209800678458482108,,774228,,, +Author,231472126788137195,,,82629615412640377,,,,, +HasTopic,,,,,1433303251800176474,7817,,, +HasTopic,,,,932362105613871012,,1049632,,, +Sale,1164902255571715230,291914370254601234,,,,571,10/8/2018,, +HasTopic,,,,,1433303251800176474,467,,, +HasTopic,,,,1220295546212024391,,37497186,,, +HasTopic,,,,817526874194673140,,732934,,, +Includes,,,1615340315424362057,202421472143651025,,,,, +HasTopic,,,,1513662032452523252,,889,,, +Includes,,,1372844135435303981,737353170652104031,,,,, +HasTopic,,,,1296829658689065159,,83460,,, +HasTopic,,,,747423119260925972,,1333024,,, +HasTopic,,,,1290121451283392110,,204,,, +Author,611117914195523184,,,440265285168056234,,,,, +Author,754480939973310112,,,1441762191425652442,,,,, +Includes,,,1427292001647224242,936722743217343702,,,,, diff --git 
a/libcusp/CMakeLists.txt b/libcusp/CMakeLists.txt index 2cc6e1714d..67b603019e 100644 --- a/libcusp/CMakeLists.txt +++ b/libcusp/CMakeLists.txt @@ -27,3 +27,5 @@ install(TARGETS galois_cusp COMPONENT lib INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) + +add_subdirectory(test) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 22fdf63d10..515f957e54 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -42,6 +42,9 @@ class PartitioningScaffold { uint64_t _numEdges; //!< number of edges in graph //! maps from host id to nodes that host as read from disk std::vector> _gid2host; + std::vector + _virtualToPhyMapping; // saving Virtual hosts to Phy hosts map + bool hash; // switch between using gid2host and VtoP maps public: /** @@ -64,7 +67,14 @@ class PartitioningScaffold { */ void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; + hash = false; } + void saveGIDToHost(std::vector& virtualToPhyMapping) { + _virtualToPhyMapping = virtualToPhyMapping; + hash = true; + } + + bool predeterminedMapping(std::vector&) { return false; } }; /** @@ -88,15 +98,19 @@ class ReadMasterAssignment : public PartitioningScaffold { * @returns Host ID of host that read the node specified by the GID. */ uint32_t retrieveMaster(uint32_t gid) const { - for (auto h = 0U; h < _numHosts; ++h) { - uint64_t start, end; - std::tie(start, end) = _gid2host[h]; - if (gid >= start && gid < end) { - return h; + if (hash == false) { + for (auto h = 0U; h < _numHosts; ++h) { + uint64_t start, end; + std::tie(start, end) = _gid2host[h]; + if (gid >= start && gid < end) { + return h; + } } + assert(false); + return _numHosts; + } else { + return _virtualToPhyMapping[gid % (_virtualToPhyMapping.size())]; } - assert(false); - return _numHosts; } // below all unused if not assigning masters in default manner, but must be @@ -149,8 +163,13 @@ class CustomMasterAssignment : public PartitioningScaffold { char _status; //!< Specifies what phase of master assignment partitioner is on //! Metadata for determining where a node's master is std::vector _localNodeToMaster; - //! Map GID to its master + //! Map GID to its master; only for nodes we own std::unordered_map _gid2masters; + //! Unlike gid2masters, this contains a mapping in vector form of ALL mappings + //! for all nodes in the graph instead of just local ones; only used if it is + //! known exactly where everything ends up before partitioning + std::vector _globalHostMap; + //! This host's node offset (each host reads a distinct contiguous portion //! of graph uint64_t _nodeOffset; @@ -183,6 +202,8 @@ class CustomMasterAssignment : public PartitioningScaffold { * mapping is not found but instead returns -1 if in stage 1, else * fails. 
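[Editor's note, not part of the patch] The BasePolicies.h hunk above adds a hashed master lookup: once a virtual-to-physical host table is saved via saveGIDToHost, ReadMasterAssignment::retrieveMaster resolves a global node ID as virtualToPhyMapping[gid % V] instead of scanning the gid2host ranges. A minimal sketch of that arithmetic, with illustrative names only:

#include <cassert>
#include <cstdint>
#include <vector>

// Hashed master lookup: gid -> virtual host (gid % V) -> physical host.
uint32_t hashedMaster(uint64_t gid, const std::vector<uint32_t>& virtualToPhy) {
  assert(!virtualToPhy.empty());
  return virtualToPhy[gid % virtualToPhy.size()];
}

// Example: virtualToPhy = {0, 1, 0, 1} spreads 4 virtual hosts over 2 physical
// hosts; gid 6 lands on virtual host 2 and is therefore mastered by host 0.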
* + * ONLY WORKS IF GID IS ON LOCAL HOST ELSE WILL FAIL + * * @param gid GID to get master of * @returns Master of specified GID, -1, unsigned, if not found */ @@ -194,19 +215,17 @@ class CustomMasterAssignment : public PartitioningScaffold { // found in map if (gidMasterIter != _gid2masters.end()) { uint32_t mappedMaster = gidMasterIter->second; - // galois::gDebug("[", _hostID, "] ", gid, " found with master ", - // mappedMaster, "!"); // make sure host is in bounds assert(mappedMaster < _numHosts); return mappedMaster; } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, " not found!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " - "of master assignment phase"); + "of master assignment phase; that or passed in gid that" + " doesn't exist on this host"); } return (uint32_t)-1; } @@ -242,7 +261,6 @@ class CustomMasterAssignment : public PartitioningScaffold { for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) { assert(i->second < localNodeToMaster.size()); - galois::gDebug("Map ", i->first, " to ", localNodeToMaster[i->second]); _gid2masters[i->first] = localNodeToMaster[i->second]; } assert(_gid2masters.size() == (originalSize + gid2offsets.size())); @@ -303,13 +321,10 @@ class CustomMasterAssignment : public PartitioningScaffold { auto offsetIntoMapIter = _gid2masters.find(gid); if (offsetIntoMapIter == _gid2masters.end()) { // NOT FOUND - galois::gDebug("[", _hostID, "] ", gid, " not found; mapping!"); _gid2masters[gid] = mappedMaster; return true; } else { // already mapped - galois::gDebug("[", _hostID, "] ", gid, " already mapped with master ", - offsetIntoMapIter->second, "!"); assert(offsetIntoMapIter->second == mappedMaster); return false; } diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6df9707a27..5541be426d 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,6 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in + * @param useWMD "true" if the passed graph file format is a WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -83,7 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useWMD = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -126,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, useTranspose, - readPolicy, nodeWeight, edgeWeight, masterBlockFile); + inputToUse, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, + useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, 
false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 1720081e77..540b25e120 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -31,6 +31,7 @@ #include #include "galois/graphs/LC_CSR_Graph.h" +#include "galois/graphs/LC_CSR_CSC_Graph.h" #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -60,13 +61,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = + galois::graphs::LC_CSR_CSC_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -256,14 +260,14 @@ class DistGraph { for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); } - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -326,14 +330,210 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + + /** + * Given the number of global nodes, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param numGlobalNodes The number of global nodes to divide + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBlockedNodes(uint64_t numGlobalNodes, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = numGlobalNodes; + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. 
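[Editor's note, not part of the patch] The CuSPPartitioner.h hunk above threads a new useWMD flag through cuspPartitionGraph so callers can flag a WMD-format CSV input. A hedged call-site sketch follows; Policy, NodeData, and EdgeData are placeholders, and the exact template parameters are elided in this rendering of the hunk:

#include <string>
#include "galois/graphs/CuSPPartitioner.h"

// Sketch only: forwards to the patched cuspPartitionGraph overload, assuming
// the application supplies its own partition policy and node/edge data types.
template <typename Policy, typename NodeData, typename EdgeData>
auto loadWMDGraph(const std::string& file) {
  // New fourth argument: useWMD = true tells CuSP the input is a WMD CSV.
  return galois::cuspPartitionGraph<Policy, NodeData, EdgeData>(
      file, galois::CUSP_CSR, galois::CUSP_CSR,
      /*useWMD=*/true, /*symmetricGraph=*/false);
}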
+ assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBalancedEdges(uint64_t numGlobalNodes, + uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + // TODO(hc): + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, outIndices, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + // TODO(hc): + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == numGlobalNodes); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by evenly + * (or unevenly as specified by scale factor) + 
* blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. + * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint64_t* outIndices, + const std::vector& scalefactor, uint32_t nodeWeight, + uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = numGlobalEdges / numGlobalNodes; // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, id, numHosts, + outIndices, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -390,10 +590,68 @@ class DistGraph { galois::runtime::reportStatCond_Tmax( GRNAME, "MasterDistTime", timer.get()); - galois::gPrint( + galois::gDebug( "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), - " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); + return numNodes_to_divide; + } + + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + uint64_t numNodes_to_divide = numGlobalNodes; + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes(numGlobalNodes, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges(numGlobalNodes, numGlobalEdges, + outIndices, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges(numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + +#if 0 + galois::gDebug( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); +#endif return numNodes_to_divide; } @@ -443,14 +701,14 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -539,6 +797,9 @@ class DistGraph { public: virtual ~DistGraph() {} + + unsigned GetLIDHost(uint64_t lid) const { return getHostIDImpl(getGID(lid)); } + //! Determines which host has the master for a particular node //! @returns Host id of node in question inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } @@ -742,6 +1003,51 @@ class DistGraph { return IDs; } + ////////////////////////////////////////////////////////////////////////////// + // for in edges + ////////////////////////////////////////////////////////////////////////////// + + //! Construct the transpose graph for the partitioned graph + void ConstructIncomingEdges() { graph.constructIncomingEdges(); } + + /** + * Get the edge data for a particular edge in the graph. + * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + typename GraphTy::edge_data_reference + GetInEdgeData(edge_iterator ni, + galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { + return graph.getInEdgeData(ni, mflag); + } + + GraphNode GetInEdgeDest(edge_iterator ni) { return graph.getInEdgeDst(ni); } + + edge_iterator in_edge_begin(GraphNode N) { + return graph.in_edge_begin(N, galois::MethodFlag::UNPROTECTED); + } + + edge_iterator in_edge_end(GraphNode N) { + return graph.in_edge_end(N, galois::MethodFlag::UNPROTECTED); + } + + galois::runtime::iterable> + in_edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(in_edge_begin(N), + in_edge_end(N)); + } + + //! 
Return corresponding out-edge index for an in-edge + size_t InEdgeToOutEdge(edge_iterator ni) const { + return graph.InEdgeToOutEdge(ni); + } + + ////////////////////////////////////////////////////////////////////////////// + // end in edges + ////////////////////////////////////////////////////////////////////////////// + protected: /** * Uses a pre-computed prefix sum to determine division of nodes among @@ -773,7 +1079,6 @@ class DistGraph { withEdgeRanges.size() != 0) { masterRanges = withEdgeRanges; } else { - galois::gDebug("Manually det. master thread ranges"); masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, beginMaster + numOwned, 0); @@ -799,7 +1104,6 @@ class DistGraph { masterRanges.size() != 0) { withEdgeRanges = masterRanges; } else { - galois::gDebug("Manually det. with edges thread ranges"); withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); } @@ -865,10 +1169,7 @@ class DistGraph { /** * Deallocates underlying LC CSR Graph */ - void deallocate() { - galois::gDebug("Deallocating CSR in DistGraph"); - graph.deallocate(); - } + void deallocate() { graph.deallocate(); } /** * Sort the underlying LC_CSR_Graph by ID (destinations) @@ -881,10 +1182,22 @@ class DistGraph { [&](GN n) { graph.sortEdges(n, IdLess()); }, galois::no_stats(), galois::loopname("CSREdgeSort"), galois::steal()); } + + //////////////////////////////////////////////////////////////////////////////// + // what follows are GNN functions; some are not great (e.g. expose arrays) + // TODO figure out better way to do this + //////////////////////////////////////////////////////////////////////////////// + EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } + NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } + + //! Used by substrate to determine if some stats are to be reported + bool is_a_graph() const { return true; } }; -template -constexpr const char* const galois::graphs::DistGraph::GRNAME; +template +constexpr const char* const + galois::graphs::DistGraph::GRNAME; } // end namespace graphs } // end namespace galois diff --git a/libcusp/include/galois/graphs/DistributedLocalGraph.h b/libcusp/include/galois/graphs/DistributedLocalGraph.h new file mode 100644 index 0000000000..2920614232 --- /dev/null +++ b/libcusp/include/galois/graphs/DistributedLocalGraph.h @@ -0,0 +1,1063 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedLocalGraph.h + * + * Contains the implementation for DistLocalGraph. Command line argument + * definitions are found in DistributedGraph.cpp. + */ + +#ifndef _GALOIS_DISTRIBUTED_LOCAL_GRAPH_H +#define _GALOIS_DISTRIBUTED_LOCAL_GRAPH_H + +#include +#include + +#include "galois/graphs/DistributedGraph.h" +#include "galois/graphs/LS_LC_CSR_Graph.h" +#include "galois/graphs/BufferedGraph.h" +#include "galois/runtime/DistStats.h" +#include "galois/graphs/OfflineGraph.h" +#include "galois/DynamicBitset.h" + +/* + * Headers for boost serialization + */ + +namespace galois { +namespace graphs { + +/** + * Base DistLocalGraph class that all distributed graphs extend from. + * + * @tparam NodeTy type of node data for the graph + * @tparam EdgeTy type of edge data for the graph + */ +template +class DistLocalGraph { +private: + //! Graph name used for printing things + constexpr static const char* const GRNAME = "dGraph"; + + using GraphTy = galois::graphs::LS_LC_CSR_Graph; + + // vector for determining range objects for master nodes + nodes + // with edges (which includes masters) + //! represents split of all nodes among threads to balance edges + std::vector allNodesRanges; + //! represents split of master nodes among threads to balance edges + std::vector masterRanges; + //! represents split of nodes with edges (includes masters) among threads to + //! balance edges + std::vector withEdgeRanges; + //! represents split of all nodes among threads to balance in-edges + std::vector allNodesRangesIn; + //! represents split of master nodes among threads to balance in-edges + std::vector masterRangesIn; + + using NodeRangeType = + galois::runtime::SpecificRange>; + + //! Vector of ranges that stores the 3 different range objects that a user is + //! able to access + std::vector specificRanges; + //! Like specificRanges, but for in edges + std::vector specificRangesIn; + +protected: + //! The internal graph used by DistLocalGraph to represent the graph + GraphTy* graph; + + //! Marks if the graph is transposed or not. + bool transposed; + + // global graph variables + uint64_t numGlobalNodes; //!< Total nodes in the global unpartitioned graph. + uint64_t numGlobalEdges; //!< Total edges in the global unpartitioned graph. + uint32_t numNodes; //!< Num nodes in this graph in total + uint64_t numEdges; //!< Num edges in this graph in total + + const unsigned id; //!< ID of the machine. + const uint32_t numHosts; //!< Total number of machines + + // local graph + // size() = Number of nodes created on this host (masters + mirrors) + uint32_t numOwned; //!< Number of nodes owned (masters) by this host. + //!< size() - numOwned = mirrors on this host + uint32_t beginMaster; //!< Local id of the beginning of master nodes. + //!< beginMaster + numOwned = local id of the end of + //!< master nodes + uint32_t numNodesWithEdges; //!< Number of nodes (masters + mirrors) that have + //!< outgoing edges + + //! Information that converts host to range of nodes that host reads + std::vector> gid2host; + //! Mirror nodes from different hosts. For reduce + std::vector> mirrorNodes; + + //! 
GID = localToGlobalVector[LID] + std::vector localToGlobalVector; + //! LID = globalToLocalMap[GID] + std::unordered_map globalToLocalMap; + + //! Increments evilPhase, a phase counter used by communication. + void inline increment_evilPhase() { + ++galois::runtime::evilPhase; + if (galois::runtime::evilPhase >= + static_cast( + std::numeric_limits::max())) { // limit defined by MPI or + // LCI + galois::runtime::evilPhase = 1; + } + } + + //! Returns evilPhase + 1, handling loop around as necessary + unsigned inline evilPhasePlus1() { + unsigned result = galois::runtime::evilPhase + 1; + + // limit defined by MPI or LCI + if (result >= uint32_t{std::numeric_limits::max()}) { + return 1; + } + return result; + } + + //! used to sort edges in the sort edges function + template + struct IdLess { + bool + operator()(const galois::graphs::EdgeSortValue& e1, + const galois::graphs::EdgeSortValue& e2) const { + return e1.dst < e2.dst; + } + }; + +private: + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBlockedNodes(galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = g.size(); + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. + assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
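+ *
+ * Sketch of the outcome: each of the numHosts * DecomposeFactor blocks
+ * receives a contiguous global-ID range in gid2host whose edge count
+ * (scaled by edgeWeight) is roughly the same, so edge-heavy regions of
+ * the graph get spread across hosts rather than piled onto one.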
+ */ + void computeMastersBalancedEdges(galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + auto r = g.divideByNode(0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == g.size()); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. 
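+ *
+ * Balance criterion (a sketch of the intent, not the exact search):
+ * each host's contiguous range is chosen so that
+ *   nodeWeight * (#nodes) + edgeWeight * (#edges)
+ * is approximately equal across hosts; a nodeWeight of 0 is replaced
+ * by the average degree before dividing.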
+ * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + galois::graphs::OfflineGraph& g, const std::vector& scalefactor, + uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = g.sizeEdges() / g.size(); // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = g.divideByNode(nodeWeight, edgeWeight, id, numHosts, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + +protected: + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + g.reset_seek_counters(); + + uint64_t numNodes_to_divide = g.size(); + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes(g, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges(g, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges(g, scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + + galois::gPrint( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + return numNodes_to_divide; + } + + //! reader assignment from a file + //! 
corresponds to master assignment if using an edge cut + void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { + // read file lines + std::ifstream mappings(filename); + std::string curLine; + + unsigned timesToRead = id + 1; + + for (unsigned i = 0; i < timesToRead; i++) { + std::getline(mappings, curLine); + } + + std::vector modifyLine(curLine.begin(), curLine.end()); + char* tokenizedString = modifyLine.data(); + char* token; + token = strtok(tokenizedString, " "); + + // loop 6 more times + for (unsigned i = 0; i < 6; i++) { + token = strtok(NULL, " "); + } + std::string left(token); + + // 3 more times for right + for (unsigned i = 0; i < 3; i++) { + token = strtok(NULL, " "); + } + std::string right(token); + + gid2host.resize(numHosts); + gid2host[id].first = std::stoul(left); + gid2host[id].second = std::stoul(right) + 1; + galois::gPrint("[", id, "] Left: ", gid2host[id].first, + ", Right: ", gid2host[id].second, "\n"); + + ///////////////////////// + // send/recv from other hosts + ///////////////////////// + auto& net = galois::runtime::getSystemNetworkInterface(); + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + + // sanity checking assignment + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + GALOIS_ASSERT(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, + gid2host[h].first, " ", gid2host[h - 1].second); + GALOIS_ASSERT(gid2host[h].second == g.size(), gid2host[h].second, " ", + g.size()); + } else { + GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, + gid2host[h].first, " ", gid2host[h - 1].second); + GALOIS_ASSERT(gid2host[h].second == gid2host[h + 1].first, + gid2host[h].second, " ", gid2host[h + 1].first); + } + } + } + + uint32_t G2L(uint64_t gid) const { + assert(isLocal(gid)); + return globalToLocalMap.at(gid); + } + + uint64_t L2G(uint32_t lid) const { return localToGlobalVector[lid]; } + +public: + //! Type representing a node in this graph + using GraphNode = typename GraphTy::VertexTopologyID; + //! Type representing an edge data in this graph + using EdgeType = EdgeTy; + //! iterator type over edges + using edge_iterator = typename GraphTy::EdgeIterator; + + /** + * Constructor for DistLocalGraph. Initializes metadata fields. + * + * @param host host number that this graph resides on + * @param numHosts total number of hosts in the currently executing program + */ + DistLocalGraph(unsigned host, unsigned numHosts) + : transposed(false), id(host), numHosts(numHosts) { + mirrorNodes.resize(numHosts); + numGlobalNodes = 0; + numGlobalEdges = 0; + } + + /** + * Return a vector of pairs denoting mirror node ranges. + * + * Assumes all mirror nodes occur after the masters: this invariant should be + * held by CuSP. 
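+ *
+ * Example (hypothetical counts): with numOwned == 100 and
+ * numNodes == 130 the returned vector is {(100, 130)}; if every local
+ * node is a master the vector is empty.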
+ */ + std::vector> getMirrorRanges() const { + std::vector> mirrorRangesVector; + // order of nodes locally is masters, outgoing mirrors, incoming mirrors, + // so just get from numOwned to end + if (numOwned != numNodes) { + assert(numOwned < numNodes); + mirrorRangesVector.push_back(std::make_pair(numOwned, numNodes)); + } + return mirrorRangesVector; + } + + std::vector>& getMirrorNodes() { return mirrorNodes; } + +private: + virtual unsigned getHostIDImpl(uint64_t) const = 0; + virtual bool isOwnedImpl(uint64_t) const = 0; + virtual bool isLocalImpl(uint64_t) const = 0; + virtual bool isVertexCutImpl() const = 0; + virtual std::pair cartesianGridImpl() const { + return std::make_pair(0u, 0u); + } + +public: + virtual ~DistLocalGraph() {} + void initGraph(uint64_t numNodes) { graph = new GraphTy(numNodes); } + //! Determines which host has the master for a particular node + //! @returns Host id of node in question + inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } + //! Determine if a node has a master on this host. + //! @returns True if passed in global id has a master on this host + inline bool isOwned(uint64_t gid) const { return isOwnedImpl(gid); } + //! Determine if a node has a proxy on this host + //! @returns True if passed in global id has a proxy on this host + inline bool isLocal(uint64_t gid) const { return isLocalImpl(gid); } + /** + * Returns true if current partition is a vertex cut + * @returns true if partition being stored in this graph is a vertex cut + */ + inline bool is_vertex_cut() const { return isVertexCutImpl(); } + /** + * Returns Cartesian split (if it exists, else returns pair of 0s + */ + inline std::pair cartesianGrid() const { + return cartesianGridImpl(); + } + + bool isTransposed() { return transposed; } + + /** + * Converts a local node id into a global node id + * + * @param nodeID local node id + * @returns global node id corresponding to the local one + */ + inline uint64_t getGID(const uint32_t nodeID) const { return L2G(nodeID); } + + /** + * Converts a global node id into a local node id + * + * @param nodeID global node id + * @returns local node id corresponding to the global one + */ + inline uint32_t getLID(const uint64_t nodeID) const { return G2L(nodeID); } + + /** + * Get data of a node. + * + * @param N node to get the data of + * @param mflag access flag for node data + * @returns A node data object + */ + inline NodeTy& getData(GraphNode N) { + auto& r = graph->getData(N); + return r; + } + + /** + * Get the edge data for a particular edge in the graph. + * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + inline EdgeTy& getEdgeData(GraphNode src, edge_iterator ni) { + GraphNode dst = getEdgeDst(ni); + auto& r = graph->getEdgeData(std::make_pair(src, getGID(dst))); + return r; + } + + inline EdgeTy& getEdgeData(edge_iterator ni) { + auto& r = graph->getEdgeData(*ni); + return r; + } + + /** + * Gets edge destination of edge ni. + * + * @param ni edge id to get destination of + * @returns Local ID of destination of edge ni + */ + GraphNode getEdgeDst(edge_iterator ni) { + return getGID(graph->getEdgeDst(*ni)); + } + + /** + * Gets the first edge of some node. + * + * @param N node to get the edge of + * @returns iterator to first edge of N + */ + inline edge_iterator edge_begin(GraphNode N) { + return graph->edges(N).begin(); + } + + /** + * Gets the end edge boundary of some node. 
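+ *
+ * Typical traversal sketch using the edges(N) helper defined below
+ * (g and n are illustrative names):
+ *
+ *   for (auto e : g.edges(n)) {
+ *     GraphNode dst = g.getEdgeDst(e);
+ *     EdgeTy& data  = g.getEdgeData(n, e);
+ *   }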
+ * + * @param N node to get the edge of + * @returns iterator to the end of the edges of node N, i.e. the first edge + * of the next node (or an "end" iterator if there is no next node) + */ + inline edge_iterator edge_end(GraphNode N) { return graph->edges(N).end(); } + + /** + * Return the degree of the edge in the local graph + **/ + inline uint64_t localDegree(GraphNode N) { return graph->getDegree(N); } + + /** + * Returns an iterable object over the edges of a particular node in the + * graph. + * + * @param N node to get edges iterator over + */ + inline galois::runtime::iterable> + edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(edge_begin(N), + edge_end(N)); + } + + /** + * Gets number of nodes on this (local) graph. + * + * @returns number of nodes present in this (local) graph + */ + inline size_t size() const { return graph->size(); } + + /** + * Gets number of edges on this (local) graph. + * + * @returns number of edges present in this (local) graph + */ + inline size_t sizeEdges() { return graph->sizeEdges(); } + + /** + * Gets number of nodes on this (local) graph. + * + * @returns number of nodes present in this (local) graph + */ + inline size_t numMasters() const { return numOwned; } + + /** + * Gets number of nodes with edges (may include nodes without edges) + * on this (local) graph. + * + * @returns number of nodes with edges (may include nodes without edges + * as it measures a contiguous range) + */ + inline size_t getNumNodesWithEdges() const { return numNodesWithEdges; } + + /** + * Gets number of nodes on the global unpartitioned graph. + * + * @returns number of nodes present in the global unpartitioned graph + */ + inline size_t globalSize() const { return numGlobalNodes; } + + /** + * Gets number of edges on the global unpartitioned graph. + * + * @returns number of edges present in the global unpartitioned graph + */ + inline size_t globalSizeEdges() const { return numGlobalEdges; } + + /** + * Returns a range object that encapsulates all nodes of the graph. + * + * @returns A range object that contains all the nodes in this graph + */ + inline const NodeRangeType& allNodesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[0]; + } + + /** + * Returns a range object that encapsulates only master nodes in this + * graph. + * + * @returns A range object that contains the master nodes in this graph + */ + inline const NodeRangeType& masterNodesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[1]; + } + + /** + * Returns a range object that encapsulates master nodes and nodes + * with edges in this graph. + * + * @returns A range object that contains the master nodes and the nodes + * with outgoing edges in this graph + */ + inline const NodeRangeType& allNodesWithEdgesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[2]; + } + + /** + * Returns a vector object that contains the global IDs (in order) of + * the master nodes in this graph. + * + * @returns A vector object that contains the global IDs (in order) of + * the master nodes in this graph + */ + std::vector getMasterGlobalIDs() { + std::vector IDs; + + IDs.reserve(numMasters()); + for (auto node : masterNodesRange()) { + IDs.push_back(getGID(node)); + } + + return IDs; + } + +protected: + /** + * Uses a pre-computed prefix sum to determine division of nodes among + * threads. + * + * The call uses binary search to determine the ranges. 
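+ *
+ * Sketch of the result: allNodesRanges holds activeThreads + 1 cut
+ * points, and thread t is meant to work on local IDs in
+ * [allNodesRanges[t], allNodesRanges[t + 1]), with the cuts placed so
+ * each thread sees a roughly equal share of edges.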
+ */ + inline void determineThreadRanges() { + allNodesRanges = galois::graphs::determineUnitRangesFromPrefixSum( + galois::runtime::activeThreads, graph->getEdgePrefixSum()); + } + + /** + * Determines the thread ranges for master nodes only and saves them to + * the object. + * + * Only call after graph is constructed + only call once + */ + inline void determineThreadRangesMaster() { + // make sure this hasn't been called before + assert(masterRanges.size() == 0); + + // first check if we even need to do any work; if already calculated, + // use already calculated vector + if (beginMaster == 0 && (beginMaster + numOwned) == size()) { + masterRanges = allNodesRanges; + } else if (beginMaster == 0 && + (beginMaster + numOwned) == numNodesWithEdges && + withEdgeRanges.size() != 0) { + masterRanges = withEdgeRanges; + } else { + galois::gDebug("Manually det. master thread ranges"); + masterRanges = galois::graphs::determineUnitRangesFromGraph( + *graph, galois::runtime::activeThreads, beginMaster, + beginMaster + numOwned, 0, true); + } + } + + /** + * Determines the thread ranges for nodes with edges only and saves them to + * the object. + * + * Only call after graph is constructed + only call once + */ + inline void determineThreadRangesWithEdges() { + // make sure not called before + assert(withEdgeRanges.size() == 0); + + // first check if we even need to do any work; if already calculated, + // use already calculated vector + if (numNodesWithEdges == size()) { + withEdgeRanges = allNodesRanges; + } else if (beginMaster == 0 && + (beginMaster + numOwned) == numNodesWithEdges && + masterRanges.size() != 0) { + withEdgeRanges = masterRanges; + } else { + galois::gDebug("Manually det. with edges thread ranges"); + withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( + *graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); + } + } + + /** + * Initializes the 3 range objects that a user can access to iterate + * over the graph in different ways. + */ + void initializeSpecificRanges() { + assert(specificRanges.size() == 0); + + // TODO/FIXME assertion likely not safe if a host gets no nodes + // make sure the thread ranges have already been calculated + // for the 3 ranges + assert(allNodesRanges.size() != 0); + assert(masterRanges.size() != 0); + assert(withEdgeRanges.size() != 0); + + // 0 is all nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(0), + boost::counting_iterator(size()), allNodesRanges.data())); + + // 1 is master nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(beginMaster), + boost::counting_iterator(beginMaster + numOwned), + masterRanges.data())); + + // 2 is with edge nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(0), + boost::counting_iterator(numNodesWithEdges), + withEdgeRanges.data())); + + assert(specificRanges.size() == 3); + } + + /** + * Specific range editor: makes the range for edges equivalent to the range + * for masters. + */ + void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; } + + void recalculateG2LMap() { + for (uint64_t i = 0; i < localToGlobalVector.size(); i++) { + globalToLocalMap[localToGlobalVector[i]] = i; + } + } + +public: + /** + * Write the local LC_CSR graph to the file on a disk. + * + * @todo revive this + */ + void save_local_graph_to_file(std::string) { GALOIS_DIE("not implemented"); } + + /** + * Read the local LC_CSR graph from the file on a disk. 
+ * + * @todo revive this + */ + void read_local_graph_from_file(std::string) { + GALOIS_DIE("not implemented"); + } + + /** + * Deallocates underlying LC CSR Graph + */ + void deallocate() { + galois::gDebug("Deallocating CSR in DistLocalGraph"); + graph->deallocate(); + } + + /** + * Sort the underlying LC_CSR_Graph by ID (destinations) + * It sorts edges of the nodes by destination. + */ + void sortEdgesByDestination() { + galois::do_all( + galois::iterate(graph->vertices().begin(), graph->vertices().end()), + [&](GraphNode n) { graph->sortEdges(n); }, galois::no_stats(), + galois::loopname("CSREdgeSort"), galois::steal()); + } + + //! Used by substrate to determine if some stats are to be reported + bool is_a_graph() const { return true; } + inline NodeTy& getTopologyID(uint64_t nodeID) { + return graph.getData(getLID(nodeID)); + } + + inline NodeTy& getTopologyIDFromIndex(uint64_t index) { + return graph.getData(index); + } + + uint64_t getTokenID(NodeTy& vertex) { + return getGID(&vertex - &graph.getData(0)); + } + + uint32_t getVertexIndex(NodeTy& vertex) { + return (&vertex - &graph.getData(0)); + } + + uint64_t getLocalityVertex(NodeTy& vertex) { + uint64_t gid = getTopologyID(vertex); + return getHostIDImpl(gid); + } + + /** Edge Manipulation **/ + edge_iterator mintEdgeHandle(NodeTy& src, std::uint64_t off) { + return edge_begin(src) + off; + } + + // template + // typename std::enable_if::value>::type + // setData(typename GraphTy::node_data_reference vertex, T data) { + // graph.setData(vertex, data); + // } + + ///** Data Manipulations **/ + + // typename GraphTy::node_data_reference + // getData(typename GraphTy::node_data_reference vertex) { + // return graph.getData(getTokenID(vertex)); + // } + + template + typename std::enable_if::value>::type + setEdgeData(edge_iterator eh, T data) { + graph.setEdgeData(eh, data); + } + + template + typename std::enable_if::value, EdgeTy&>::type + getEdgeData(edge_iterator eh) { + return graph.getEdgeData(eh); + } + + enum Task { + ADD_VERTEX, + ADD_VERTEX_TOPOLOGY_ONLY, + ADD_EDGES, + ADD_EDGES_TOPOLOGY_ONLY, + DELETE_VERTEX, + DELETE_EDGES + }; + + template + void sendModifyRequest(uint32_t host, Args... args) { + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, args...); + galois::runtime::getSystemNetworkInterface().sendTagged( + host, galois::runtime::evilPhase, std::move(b)); + } + + // Assumptions: + // 1. A vertex is added before any edges are added to it + // 2. No support for deleting edges/vertices yet + // 3. 
Only works for OEC + void + updateVariables(bool isVertex, uint64_t src, + std::optional> dsts = std::nullopt) { + + if (isVertex) { + if (globalToLocalMap.find(src) == globalToLocalMap.end()) { + localToGlobalVector.push_back(src); + globalToLocalMap[src] = localToGlobalVector.size() - 1; + numNodes++; + } + numOwned++; + } else { + uint64_t srcLID = globalToLocalMap[src]; + if (edge_begin(srcLID) == edge_end(srcLID)) { + numNodesWithEdges++; + } + for (auto token : dsts.value()) { + if (globalToLocalMap.find(token) == globalToLocalMap.end()) { + localToGlobalVector.push_back(token); + globalToLocalMap[token] = localToGlobalVector.size() - 1; + numNodes++; + } + if (!isOwned(token)) { + mirrorNodes[getHostID(token)].push_back(token); + } + } + numEdges += dsts.value().size(); + } + } + + /** Topology Modifications **/ + void addVertexTopologyOnly(uint32_t token) { + uint64_t belongsTo = getHostID(token); + if (belongsTo == id) { + updateVariables(true, token); + // graph->addVertexTopologyOnly(); + } else { + sendModifyRequest(belongsTo, ADD_VERTEX_TOPOLOGY_ONLY, token); + } + } + + template + void addVertex(uint64_t token, T data) { + uint64_t belongsTo = getHostID(token); + if (belongsTo == id) { + updateVariables(true, token); + // graph->setData(getLID(token), data); + } else { + sendModifyRequest(belongsTo, ADD_VERTEX, token, data); + } + } + + void addEdgesTopologyOnly(uint64_t src, std::vector dsts) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + updateVariables(false, src, dsts); + graph->addEdgesTopologyOnly(getLID(src), dsts); + } else { + sendModifyRequest(belongsTo, ADD_EDGES_TOPOLOGY_ONLY, src, dsts); + } + } + + void addEdges(uint64_t src, std::vector dsts, + std::vector data) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + updateVariables(false, src, dsts); + std::vector lids; + for (uint32_t i = 0; i < dsts.size(); i++) { + lids.push_back(getLID(dsts[i])); + } + graph->addEdges(getLID(src), lids, data); + } else { + sendModifyRequest(belongsTo, src, dsts, data); + } + } + + void deleteVertex(uint64_t src) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + // TODO(Divija): Uncomment when we have the graph API + // graph.deleteVertex(getLID(src)); + } else { + sendModifyRequest(belongsTo, DELETE_VERTEX, src); + } + } + + void deleteEdges(uint64_t src, std::vector edges) { + // TODO:Remove dst tokens from local map? 
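+    // Same owner-check pattern as the mutators above (sketch): if this
+    // host owns src the edges would be removed locally once the graph
+    // API exists; otherwise a DELETE_EDGES request is forwarded to the
+    // owning host via sendModifyRequest.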
+ uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + // TODO(Divija): Uncomment when we have the graph API + // return graph.deleteEdges(getLID(src), edges); + } else { + sendModifyRequest(belongsTo, DELETE_EDGES, src, edges); + } + } +}; + +template +constexpr const char* const + galois::graphs::DistLocalGraph::GRNAME; +} // end namespace graphs +} // end namespace galois + +#endif //_GALOIS_DISTRIBUTED_LOCAL_GRAPH_H diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index 942a2ceb61..3794d9eef1 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -25,8 +25,6 @@ class NoCommunication : public galois::graphs::ReadMasterAssignment { } }; -/** - */ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { public: MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t, @@ -38,6 +36,17 @@ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; } }; +class OECPolicy : public galois::graphs::ReadMasterAssignment { +public: + OECPolicy(uint32_t, uint32_t numHosts, uint64_t, uint64_t, + std::vector&) + : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} + + static bool needNodeDegrees() { return false; } + + bool keepEdge(uint32_t, uint32_t) const { return true; } +}; + class MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment { std::vector& ndegrees; @@ -905,4 +914,145 @@ class SugarColumnFlipP : public galois::graphs::CustomMasterAssignment { } }; +class GnnOEC : public galois::graphs::CustomMasterAssignment { +public: + GnnOEC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges){}; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + //! outgoing edge cut + uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { + return retrieveMaster(src); + } + + bool noCommunication() { return false; } + bool isVertexCut() const { return false; } + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(0u, 0u); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + +class GnnCVC : public galois::graphs::CustomMasterAssignment { + unsigned numRowHosts; + unsigned numColumnHosts; + unsigned _h_offset; + + void factorizeHosts() { + numColumnHosts = sqrt(_numHosts); + + while ((_numHosts % numColumnHosts) != 0) + numColumnHosts--; + + numRowHosts = _numHosts / numColumnHosts; + assert(numRowHosts >= numColumnHosts); + + if (_hostID == 0) { + galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, + "\n"); + } + } + + //! 
Returns the grid row ID of this host + unsigned gridRowID() const { return (_hostID / numColumnHosts); } + //! Returns the grid row ID of the specified host + unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } + //! Returns the grid column ID of this host + unsigned gridColumnID() const { return (_hostID % numColumnHosts); } + //! Returns the grid column ID of the specified host + unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + + //! Find the row of a particular node + unsigned getRowOfNode(uint64_t gid) const { + return gridRowID(retrieveMaster(gid)); + } + + //! Find the column of a particular node + unsigned getColumnOfNode(uint64_t gid) const { + return gridColumnID(retrieveMaster(gid)); + } + +public: + GnnCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges) { + factorizeHosts(); + _h_offset = gridRowID() * numColumnHosts; + }; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { + unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; + unsigned cyclicColumnOffset = getColumnOfNode(dst); + return blockedRowOffset + cyclicColumnOffset; + } + + bool noCommunication() { return false; } + bool isVertexCut() const { + if ((numRowHosts == 1) || (numColumnHosts == 1)) + return false; + return true; + } + + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(numRowHosts, numColumnHosts); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/libcusp/include/galois/graphs/MiningPartitioner.h b/libcusp/include/galois/graphs/MiningPartitioner.h index e49d16023e..c809c24dd0 100644 --- a/libcusp/include/galois/graphs/MiningPartitioner.h +++ b/libcusp/include/galois/graphs/MiningPartitioner.h @@ -540,15 +540,15 @@ class MiningGraph : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::SendBuffer bitsetBuffer; galois::runtime::gSerialize(bitsetBuffer, presentProxies); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // receive loop for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize proxiesOnOtherHosts @@ -653,8 +653,7 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, 
galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); @@ -675,9 +674,9 @@ class MiningGraph : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1057,15 +1056,15 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -1085,8 +1084,8 @@ class MiningGraph : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -1108,7 +1107,7 @@ class MiningGraph : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -1134,8 +1133,8 @@ class MiningGraph : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index bfc92d989a..e8d7e15d8e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -29,6 +29,9 @@ #include "galois/graphs/DistributedGraph.h" #include "galois/DReducible.h" + +#include "shad/ShadGraphConverter.h" + #include #include @@ -75,6 +78,69 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; + std::vector getGNNBreakpoints(std::string filename) { + // contains 2 numbers: begin and end of train + // everything else can be split evenly among hosts as they are not + // performance critical + std::vector bps; + + // TODO(loc) avoid this entirely and load it from file... 
+ // if through all possible GNN outputs + if (filename.find("cora") != std::string::npos) { + bps.push_back(0); + bps.push_back(140); + } else if (filename.find("reddit") != std::string::npos) { + bps.push_back(0); + bps.push_back(153431); + } else if (filename.find("citeseer") != std::string::npos) { + bps.push_back(0); + bps.push_back(120); + } else if (filename.find("pubmed") != std::string::npos) { + bps.push_back(0); + bps.push_back(60); + } else if (filename.find("ppi") != std::string::npos) { + bps.push_back(0); + bps.push_back(9716); + } else if (filename.find("tester") != std::string::npos) { + bps.push_back(0); + bps.push_back(5); + } else if (filename.find("ogbn-arxiv") != std::string::npos) { + bps.push_back(0); + bps.push_back(169252); + } else if (filename.find("ogbn-products") != std::string::npos) { + bps.push_back(0); + bps.push_back(196615); + } else if (filename.find("yelp") != std::string::npos) { + // this is entire graph: yelp's mask isn't contiguous + bps.push_back(0); + bps.push_back(716847); + } else if (filename.find("amazon") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(1569960); + } else if (filename.find("ogbn-proteins") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(86618); + } else if (filename.find("ogbn-papers100M-remap") != std::string::npos) { + galois::gInfo("papers remap being used"); + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(1207178); + } else if (filename.find("ogbn-papers100M") != std::string::npos) { + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(111059956); + } else { + // TODO(loc) only die under certain conditions; don't die if something + // is missing + // GALOIS_DIE("invalid input for gnn partitioning ", filename, + // " hardcode needed"); + } + + return bps; + } + public: //! 
typedef for base DistGraph class using base_DistGraph = DistGraph; @@ -157,7 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false, + bool useWMD = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -170,24 +237,83 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.start(); if (readFromFile) { - galois::gPrint("[", base_DistGraph::id, - "] Reading local graph from file ", localGraphFileName, - "\n"); + galois::gDebug("[", base_DistGraph::id, + "] Reading local graph from file ", localGraphFileName); base_DistGraph::read_local_graph_from_file(localGraphFileName); Tgraph_construct.stop(); return; } - galois::graphs::OfflineGraph g(filename); - base_DistGraph::numGlobalNodes = g.size(); - base_DistGraph::numGlobalEdges = g.sizeEdges(); + galois::graphs::OfflineGraph* offlineGraph{nullptr}; + + std::string host_prefix = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + + shad::ShadGraphConverter shadConverter; + galois::graphs::BufferedGraph bufGraph; + bufGraph.resetReadCounters(); + std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight); + if (useWMD) { + uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + galois::gInfo(host_prefix, "Starts reading SHAD graph file"); + // Read and load the whole SHAD WMD dataset to memory. + // TODO(hc): Note that this reads the entire graph. + // We will improve this to read partial graphs + // on each host later. For now, the main focus is + // to enable WMD dataset for the workflows. + shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + galois::gInfo(host_prefix, "Completes reading SHAD graph file"); + base_DistGraph::numGlobalNodes = numGlobalNodes; + base_DistGraph::numGlobalEdges = numGlobalEdges; + + galois::gInfo(host_prefix, + "Read graph # nodes:", std::to_string(numGlobalNodes), + " # edges:", std::to_string(numGlobalEdges)); + galois::gInfo(host_prefix, "Starts node array construction from SHAD" + " graph"); + // Construct node data/outgoing index range arrays + // for a GLOBAL array, not a local array. + // Later, parts for the local graph partition will be + // extracted and be used after graph partitioning is done. + // Basically, the idea that is used here is to mimic + // the BufferedGraph. BufferedGraph does not load the whole arrays + // to memory, but only reads and loads parts of the arrays from + // an input file. It is possible since the .gr files are stored + // in a CSR format, and in a consecutive manner. We can know + // offset for each data in advance. + // However, we cannot achieve it from a SHAD graph file since + // it is not consecutive, but edges and nodes are mixed. + // Due to this, we construct nodes' array for a global graph + // here. This array will be restructured after CuSP decides + // local nodes. + // TODO(hc): UT will improve and redesign this part to + // get scalability. 
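+        // (Sketch of the flow: the arrays built here span the full
+        // [0, numGlobalNodes) range; the slice belonging to this host is
+        // extracted later in constructCSRFromSHADGraph, once the
+        // partitioner has fixed this host's node range.)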
+ shadConverter.constructNodeArrays(0, numGlobalNodes, numGlobalNodes); + galois::gInfo(host_prefix, "Completes node array construction from SHAD" + " graph"); + // Compute master proxies by using the number of global nodes + // and edges. + base_DistGraph::computeMasters( + md, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, edgeWeight); + } else { + offlineGraph = new galois::graphs::OfflineGraph(filename); + base_DistGraph::numGlobalNodes = offlineGraph->size(); + base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, + edgeWeight); + } } else { + if (useWMD) { + GALOIS_DIE("SHAD graph format does not support master block file"); + } galois::gInfo("Getting reader assignment from file"); - base_DistGraph::readersFromFile(g, masterBlockFile); + base_DistGraph::readersFromFile(*offlineGraph, masterBlockFile); } graphPartitioner = std::make_unique( @@ -196,12 +322,43 @@ class NewDistGraphGeneric : public DistGraph { // TODO abstract this away somehow graphPartitioner->saveGIDToHost(base_DistGraph::gid2host); - uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = - g.edge_begin(nodeBegin); - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = - g.edge_begin(nodeEnd); + // get training nodes and split evenly among hosts + std::vector trainPoints = this->getGNNBreakpoints(filename); + // TODO(hc) + if (!trainPoints.empty()) { + std::vector testDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], + trainPoints[1]); + + std::vector restDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], + offlineGraph->size()); + + // create global distribution of edges + std::vector mappings(offlineGraph->size()); + galois::do_all( + galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), + [&](size_t h) { + // test + uint32_t hCur = testDistribution[h]; + uint32_t hEnd = testDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + // the rest + hCur = restDistribution[h]; + hEnd = restDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + }); + bool validPart = graphPartitioner->predeterminedMapping(mappings); + if (!validPart) { + galois::gWarn("partitioning policy used doesn't use trainpoints"); + } + } // signifies how many outgoing edges a particular host should expect from // this host @@ -222,27 +379,39 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 - galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n"); - galois::graphs::BufferedGraph bufGraph; - bufGraph.resetReadCounters(); + galois::gDebug("[", base_DistGraph::id, "] Starting graph reading."); galois::StatTimer graphReadTimer("GraphReading", GRNAME); graphReadTimer.start(); - bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, - *edgeEnd, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges); + + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + + if (!useWMD) { + // If the input graph is not SHAD WMD format, + // construct a buffered graph from the file directly, as ordinary. 
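+      // (Sketch: [nodeBegin, nodeEnd) is this host's reader range, and
+      // *edgeBegin / *edgeEnd below are the matching offsets into the
+      // on-disk CSR, so loadPartialGraph only reads that slice of the
+      // file.)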
+ typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = + offlineGraph->edge_begin(nodeBegin); + typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = + offlineGraph->edge_begin(nodeEnd); + bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, + *edgeEnd, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges); + } else { + constructCSRFromSHADGraph(&bufGraph, &shadConverter, nodeBegin, nodeEnd, + host_prefix); + } + graphReadTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Reading graph complete."); if (graphPartitioner->masterAssignPhase()) { // loop over all nodes, determine where neighbors are, assign masters galois::StatTimer phase0Timer("Phase0", GRNAME); - galois::gPrint("[", base_DistGraph::id, - "] Starting master assignment.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting master assignment."); phase0Timer.start(); phase0(bufGraph, cuspAsync, stateRounds); phase0Timer.stop(); - galois::gPrint("[", base_DistGraph::id, - "] Master assignment complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Master assignment complete."); } galois::StatTimer inspectionTimer("EdgeInspection", GRNAME); @@ -357,13 +526,23 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::initializeSpecificRanges(); Tgraph_construct.stop(); - galois::gPrint("[", base_DistGraph::id, "] Graph construction complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); + + if (useWMD) { + // Different from the gr format file that has been used by Galois + // and does not contain node data in the file, + // a SHAD graph file has a single type for each node, and it + // is considered as node data. + // This function constructs and sets node data (type). + assignNodeDataFromSHADProp(&shadConverter); + } // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", (uint32_t)stateRounds); } + galois::gPrint("[", base_DistGraph::id, "] Dist graph constructed\n"); } private: @@ -406,6 +585,108 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /// Construct arrays for in-memory CSR. + /// In case of the node out-going edge range array and + /// the node data array, it will extract parts corresponding to + /// local graph paritition from the arrays holding the global + /// array information. + /// Edge destination and data arrays are constructed based on + /// unordered maps constructed from SHAD graph reading. + /// NOTE that those arrays for CSR all store GLOBAL node ids. + /// For example, edge destination array's size is equal + /// to the number of local edges, but its destination ID is + /// global node IDs, not local node IDs. 
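+  ///
+  /// Example (hypothetical ranges): if this host reads global nodes
+  /// [100, 200), the out-index slice extracted here covers just those
+  /// 100 nodes, while the edge-destination array keeps global IDs
+  /// (e.g. 57 or 1024) rather than local ones.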
+ /// + /// @tparam T Graph node data type + /// + /// @param bufGraph Buffered graph to construct + /// @param shadConverter Shad graph ingestor which ingested + /// a SHAD graph in memory to an unordered node/edge map + /// @param nodeBegin Global id of the first local node range + /// @param nodeEnd Global id of the last local node range + /// @param host_prefix Log prefix string for this host + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void + constructCSRFromSHADGraph(galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, + std::string host_prefix) { + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. + uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = (nodeBegin == 0) ? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + galois::gInfo(host_prefix, "Starts local out index array construction"); + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); + galois::gInfo(host_prefix, "Completes local out index array construction"); + + galois::gInfo(host_prefix, "Starts edge destination/data " + "array construction"); + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, + numLocalEdges); + + galois::gInfo(host_prefix, "Completes edge destination/data " + "array construction"); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter->constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, bufGraph); + galois::gInfo(host_prefix, "Completes buffered graph construction from" + " SHAD graph"); + } + + // Disable this method for non-SHAD graph construction. + template >* = + nullptr> + void constructCSRFromSHADGraph(galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, + std::string) {} + + /** + * @brief Assign a SHAD node type to a node data. + * + * @detail Different from the gr format file that has been used by Galois + * and does not contain node data in the file, + * a SHAD graph file has a single type for each node, and it + * considered as node data. This function constructs and sets node + * data based on that. + * This function assumes that the node type's data type is always + * uint64_t. + * + * @tparam T Node data type + * + * @param shadConverter SHAD graph converter holding node data from a + * SHAD file. 
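+   *
+   * In effect (see the loop below), each local node lid receives
+   * nodeDataBuffer[getGID(lid)] as its node data.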
+ */ + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); + shad::ShadNodeTy* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + }); + } + + template >* = + nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter*) {} + /** * For each other host, determine which nodes that this host needs to get * info from @@ -496,16 +777,10 @@ class NewDistGraphGeneric : public DistGraph { lid++; } } - galois::gDebug("[", base_DistGraph::id, " -> ", h, "] bitset size ", - (end - start) / 64, " vs. vector size ", - syncNodes[h].size() / 2); } lid -= numLocal; assert(lid == numToReserve); - galois::gDebug("[", base_DistGraph::id, "] total bitset size ", - (ghosts.size() - numLocal) / 64, " vs. total vector size ", - numToReserve / 2); // TODO: should not be used after this - refactor to make this clean ghosts.resize(0); @@ -539,16 +814,16 @@ class NewDistGraphGeneric : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]); bytesSent += bitsetBuffer.size(); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // Step 5: recv bitset to other hosts; this indicates which local nodes each // other host needs to be informed of updates of for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize into neighbor bitsets @@ -638,7 +913,7 @@ class NewDistGraphGeneric : public DistGraph { // note the +1 on evil phase; load messages send using a different // phase to avoid conflicts - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } } sendTimer.stop(); @@ -658,13 +933,13 @@ class NewDistGraphGeneric : public DistGraph { std::vector& edgeLoads, galois::DynamicBitSet& loadsClear) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p; + decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1())) p; galois::StatTimer recvTimer("Phase0AsyncRecvLoadTime", GRNAME); recvTimer.start(); do { // note the +1 - p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr); + p = net.recieveTagged(base_DistGraph::evilPhasePlus1()); if (p) { unsigned messageType = (unsigned)-1; @@ -859,13 +1134,13 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, mastersToSend); } bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } else { // send empty no-op message, tag 0 galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, 0u); bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } sendOffsetsTimer.stop(); @@ -934,9 +1209,9 @@ class NewDistGraphGeneric : 
public DistGraph { bytesSent += b.size(); // assumes phase is 0 or 1 if (phase == 1) { - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } else if (phase == 0) { - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } else { GALOIS_DIE("unexpected phase: ", phase); } @@ -954,8 +1229,6 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedOffsets, std::vector& receivedMasters) { uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first; - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, " offset ", - hostOffset); // if execution gets here, messageType was 1 or 2 assert(receivedMasters.size() == receivedOffsets.size()); @@ -963,10 +1236,8 @@ class NewDistGraphGeneric : public DistGraph { galois::do_all( galois::iterate((size_t)0, receivedMasters.size()), [&](size_t i) { - uint64_t curGID = hostOffset + receivedOffsets[i]; - uint32_t indexIntoMap = gid2offsets[curGID]; - galois::gDebug("[", base_DistGraph::id, "] gid ", curGID, " offset ", - indexIntoMap); + uint64_t curGID = hostOffset + receivedOffsets[i]; + uint32_t indexIntoMap = gid2offsets[curGID]; localNodeToMaster[indexIntoMap] = receivedMasters[i]; }, galois::no_stats()); @@ -985,9 +1256,9 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedMasters) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1011,9 +1282,6 @@ class NewDistGraphGeneric : public DistGraph { messageType); } - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); - return std::make_pair(sendingHost, messageType); } @@ -1030,11 +1298,11 @@ class NewDistGraphGeneric : public DistGraph { std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; // repeat loop until no message do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); if (p) { uint32_t sendingHost = p->first; unsigned messageType = (unsigned)-1; @@ -1068,9 +1336,6 @@ class NewDistGraphGeneric : public DistGraph { GALOIS_DIE("invalid message type for sync of master assignments: ", messageType); } - - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); } } while (p); } @@ -1258,8 +1523,6 @@ class NewDistGraphGeneric : public DistGraph { // gid to vector offset setup std::unordered_map gid2offsets; uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes); - galois::gDebug("[", base_DistGraph::id, "] num neighbors found is ", - neighborCount); // send off neighbor metadata phase0SendRecv(syncNodes); @@ -1291,7 +1554,7 @@ class NewDistGraphGeneric : public DistGraph { if (async) { if (base_DistGraph::id == 0) { - galois::gPrint("Using asynchronous master determination sends.\n"); + galois::gDebug("Using asynchronous master determination sends."); } hostFinished.resize(base_DistGraph::numHosts); @@ -1309,8 +1572,8 @@ class 
NewDistGraphGeneric : public DistGraph { #endif if (base_DistGraph::id == 0) { - galois::gPrint("Number of BSP sync rounds in master assignment: ", - stateRounds, "\n"); + galois::gDebug("Number of BSP sync rounds in master assignment: ", + stateRounds); } // galois::PerThreadTimer ptt( @@ -1328,13 +1591,6 @@ class NewDistGraphGeneric : public DistGraph { auto work = getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode); - // debug print - // galois::on_each([&] (unsigned i, unsigned j) { - // galois::gDebug("[", base_DistGraph::id, " ", i, "] sync round ", - // syncRound, " local range ", - // *work.local_begin(), " ", *work.local_end()); - //}); - galois::do_all( // iterate over my read nodes galois::iterate(work), @@ -1352,10 +1608,6 @@ class NewDistGraphGeneric : public DistGraph { // on map with subtraction localNodeToMaster[node - globalOffset] = assignedHost; - // galois::gDebug("[", base_DistGraph::id, "] state round ", - // syncRound, - // " set ", node, " ", node - globalOffset); - // ptt.stop(); }, galois::loopname("Phase0DetermineMasters"), galois::steal(), @@ -1386,13 +1638,6 @@ class NewDistGraphGeneric : public DistGraph { asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear); } loadSyncTimer.stop(); - -#ifndef NDEBUG - if (async) { - galois::gDebug("[", base_DistGraph::id, "] host count ", - hostFinished.count()); - } -#endif } // if asynchronous, don't move on until everything is done @@ -1409,14 +1654,6 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { - //#ifndef NDEBUG - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to - // finish, ", - // hostFinished.count()); - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts - // loads " - // "syncs to finish, ", loadsClear.count()); - //#endif // make sure all assignments are done and all loads are done syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); @@ -1425,15 +1662,9 @@ class NewDistGraphGeneric : public DistGraph { waitTime.stop(); } -#ifndef NDEBUG - printLoad(nodeLoads, nodeAccum); - printLoad(edgeLoads, edgeAccum); -#endif - // sanity check for correctness (all should be assigned) for (uint32_t i = 0; i < localNodeToMaster.size(); i++) { if (localNodeToMaster[i] == (uint32_t)-1) { - // galois::gDebug("[", base_DistGraph::id, "] bad index ", i); assert(localNodeToMaster[i] != (uint32_t)-1); } } @@ -1444,9 +1675,9 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::increment_evilPhase(); } - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Local master assignment " - "complete.\n"); + "complete."); // one more step: let masters know of nodes they own (if they don't // have the node locally then this is the only way they will learn about @@ -1458,7 +1689,7 @@ class NewDistGraphGeneric : public DistGraph { recvMastersToOwners(); p0master2ownerTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Received my master mappings.\n"); + galois::gDebug("[", base_DistGraph::id, "] Received my master mappings."); base_DistGraph::increment_evilPhase(); @@ -1503,11 +1734,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - 
allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // get incoming mirrors ready for creation uint32_t additionalMirrorCount = incomingMirrors.count(); @@ -1606,7 +1836,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1637,10 +1867,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1658,7 +1888,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1688,10 +1918,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1724,11 +1954,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); // report edge inspection time uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // old inspection barrier // galois::runtime::getHostBarrier().wait(); @@ -1990,9 +2219,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; // size_t actuallySet = 0; for (uint32_t offset : offsetsToConsider.getOffsets()) { - // galois::gDebug("[", base_DistGraph::id, "] ", " setting ", - // offset + hostOffset, " from host ", senderHost, - // " to ", recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(offset + hostOffset, recvMasterLocations[curCount]); // bool set = graphPartitioner->addMasterMapping(offset + hostOffset, @@ -2000,9 +2226,6 @@ class NewDistGraphGeneric : public DistGraph { // if (set) { actuallySet++; } curCount++; } - - // galois::gDebug("[", base_DistGraph::id, "] host ", senderHost, ": set ", - // actuallySet, " out of ", recvMasterLocations.size()); } /** @@ -2019,9 +2242,6 @@ class NewDistGraphGeneric : public 
DistGraph { size_t curCount = 0; for (uint64_t gid : gids) { assert(gid < base_DistGraph::numGlobalNodes); - // galois::gDebug("[", base_DistGraph::id, "] ", " in-setting ", gid, " to - // ", - // recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]); curCount++; } @@ -2082,7 +2302,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, offsets); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2091,7 +2310,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, 1); galois::runtime::gSerialize(b, curBitset); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2102,14 +2320,13 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); - galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection sends complete."); } /** @@ -2127,9 +2344,9 @@ class NewDistGraphGeneric : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -2189,8 +2406,7 @@ class NewDistGraphGeneric : public DistGraph { } } - galois::gPrint("[", base_DistGraph::id, - "] Inspection receives complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection receives complete."); } /** @@ -2217,10 +2433,7 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gDebug("[", base_DistGraph::id, - "] To receive this many nodes: ", nodesToReceive); - - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection mapping complete."); return prefixSumOfEdges; } @@ -2256,9 +2469,6 @@ class NewDistGraphGeneric : public DistGraph { galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { - // galois::gDebug("[", base_DistGraph::id, "] ", i + startNode, - // " mapped to ", - // graphPartitioner->retrieveMaster(i+startNode)); if (graphPartitioner->retrieveMaster(i + startNode) == myHID) { count++; } @@ -2275,9 +2485,7 @@ class NewDistGraphGeneric : public DistGraph { assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); - uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; - galois::gDebug("[", base_DistGraph::id, "] This many masters from host ", - h, ": ", newMasterNodes); + uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; uint32_t startingNodeIndex = base_DistGraph::numNodes; // increase size of prefix sum + mapping vector 
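+    // (illustrative) threadPrefixSums is prefix-summed over the per-thread
+    // master counts gathered above, e.g. per-thread counts {3, 1, 2} become
+    // {3, 4, 6}, so its last entry is the total number of new masters learned
+    // from host h; the prefix-sum and mapping vectors below grow by exactly
+    // that many entries.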
prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes); @@ -2565,6 +2773,7 @@ class NewDistGraphGeneric : public DistGraph { for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes; i++) { uint32_t globalID = base_DistGraph::localToGlobalVector[i]; + assert(graphPartitioner->retrieveMaster(globalID) != base_DistGraph::id); base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)] .push_back(globalID); } @@ -2577,9 +2786,9 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::BufferedGraph& bufGraph) { if (base_DistGraph::id == 0) { if (std::is_void::value) { - fprintf(stderr, "Loading void edge-data while creating edges.\n"); + galois::gDebug("Loading void edge-data while creating edges."); } else { - fprintf(stderr, "Loading edge-data while creating edges.\n"); + galois::gDebug(stderr, "Loading edge-data while creating edges."); } } @@ -2604,10 +2813,10 @@ class NewDistGraphGeneric : public DistGraph { loadEdgeTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ", + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", loadEdgeTimer.get_usec() / 1000000.0f, " seconds to read ", bufBytesRead, " bytes (", - bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)\n"); + bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)"); } // Edge type is not void. (i.e. edge data exists) @@ -2719,16 +2928,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2751,8 +2959,8 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -2865,16 +3073,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2897,8 +3104,7 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); } } } @@ -2920,7 +3126,7 @@ class NewDistGraphGeneric : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while 
(rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -2946,8 +3152,8 @@ class NewDistGraphGeneric : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/test/CMakeLists.txt b/libcusp/test/CMakeLists.txt new file mode 100644 index 0000000000..710627302c --- /dev/null +++ b/libcusp/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(shad_dist_graph shad-dist-graph.cpp) +target_link_libraries(shad_dist_graph galois_gnn) diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp new file mode 100644 index 0000000000..492bfeb2ad --- /dev/null +++ b/libcusp/test/shad-dist-graph.cpp @@ -0,0 +1,132 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +#include + +#include "galois/Galois.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "shad/ShadGraphConverter.h" + +int main() { + galois::DistMemSys G; + unsigned M = galois::substrate::getThreadPool().getMaxThreads(); + // M = 1; + galois::setActiveThreads(M); + + shad::ShadGraphConverter shadConverter; + size_t numNodes{0}, numEdges{0}; + + // TODO(hc): This path should be properly set based on user's environment. + // Later, this test dataset will be included in the Galois repository, and + // will use a relative path. 
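+  // One interim option (illustrative sketch only, not wired up here): take
+  // the path from an environment variable and fall back to the default, e.g.
+  //   const char* p = std::getenv("SHAD_TEST_CSV"); // hypothetical variable
+  //   std::string filename = p ? p : "/home/hochan/data.01.csv";
+  // (std::getenv requires <cstdlib>.)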
+ std::string filename = "/home/hochan/data.01.csv"; + shadConverter.readSHADFile(filename, &numNodes, &numEdges); + std::unique_ptr> + graph = galois::cuspPartitionGraph( + filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + + std::cout << "Test starts...\n"; + + galois::DGAccumulator sumGlobalNodes; + galois::DGAccumulator sumGlobalEdges; + + sumGlobalNodes.reset(); + sumGlobalEdges.reset(); + + sumGlobalNodes += graph->numMasters(); + sumGlobalEdges += graph->sizeEdges(); + + uint64_t reducedSumGlobalNodes = sumGlobalNodes.reduce(); + uint64_t reducedSumGlobalEdges = sumGlobalEdges.reduce(); + + assert(reducedSumGlobalNodes == numNodes); + assert(reducedSumGlobalNodes == graph->globalSize()); + assert(reducedSumGlobalEdges == numEdges); + assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + + std::cout << "Num. nodes/edges tests has been passed\n"; + + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; + { + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src).type + << ", key: " << graph->getData(src).key << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << graph->getEdgeData(e) + << "\n"; + } + } + fp.close(); + } + + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + + ".graph"); + for (uint32_t i = 0; i < graph->size(); ++i) { + fp << i << ", " << graph->getGID(i) << ", " << graph->getData(i).type + << ", " << graph->getData(i).key << "\n"; + } + fp.close(); + } + } +#if 0 + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".mirror"); + for (uint32_t i = 0; + i < graph->getMirrorNodes()[host].size(); ++i) { + uint64_t srcglobal = graph->getMirrorNodes()[host][i]; + uint32_t src = graph->getLID(srcglobal); + fp << "src:" << src << ", global:" << srcglobal << ", node data:" << + graph->getData(src) << "\n" << std::flush; + + assert(shadConverter.checkNode(srcglobal, graph->getData(src))); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + //if (std::distance(graph->edge_begin(src), graph->edge_end(src)) > 0) { + for (auto e : graph->edges(src)) { + uint32_t dst = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dst); + assert(shadConverter.checkNode(dstglobal, graph->getData(dst))); + assert(shadConverter.checkEdge(srcglobal, dstglobal, + std::distance(graph->edge_begin(src), e), + graph->getEdgeData(e))); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n" << std::flush; + } + } + fp.close(); + } + } +#endif + + return 0; +} diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt new file mode 100644 index 0000000000..44be89edad --- /dev/null +++ b/libdeepgalois/CMakeLists.txt @@ -0,0 +1,131 @@ +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") +if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + 
message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) +message(STATUS "BLAS_INC_DIR: ${BLAS_INC_DIR}") +message(STATUS "BLAS_LIB_DIR: ${BLAS_LIB_DIR}") + +# galois base libs +include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +link_directories(${CMAKE_SOURCE_DIR}/libgalois) + +if(GALOIS_ENABLE_GPU) + # hetero path + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU --extended-lambda ${CUDA_NVCC_FLAGS}") + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + include_directories("${CUB_ROOT}") + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + include_directories("${MGPU_ROOT}/src") + include_directories(${CUDA_HOME}/include) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + + find_package(CUDA REQUIRED) + set(CUDA_SEPARABLE_COMPILATION ON) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(CUDA_HOST_COMPILER g++) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_60,code=sm_60) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) + link_directories(${CUDA_HOME}/lib64) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) + message(STATUS "CUDA_LIB_DIR: ${CUDA_HOME}/lib64") + + set(CUDA_SOURCES + src/layers/graph_conv_layer.cu + src/layers/softmax_loss_layer.cu + src/layers/sigmoid_loss_layer.cu + src/layers/leaky_relu_layer.cu + src/layers/l2_norm_layer.cu + src/layers/relu_layer.cu + src/layers/aggregator.cu + src/math_functions.cu + src/optimizer.cu + src/DistContext.cu + src/Sampler.cu + src/lgraph.cu + src/node.cu + src/Net.cu + ) + cuda_add_library(dg_gpu ${CUDA_SOURCES}) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) + set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") + set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp src/Train.cpp) +else() + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/optimizer.cpp + src/DistContext.cpp + src/RandomWalk.cpp + src/Sampler.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/Train.cpp + src/node.cpp + src/Net.cpp + ) +endif(GALOIS_ENABLE_GPU) + +add_library(dg_cpu STATIC ${sources}) +target_link_libraries(dg_cpu galois_shmem) +target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) +target_link_libraries(dg_cpu ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libgalois/include + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +# dist galois setup/linking to dg_cpu +if(GALOIS_ENABLE_DIST) + 
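+  # (illustrative) this branch is taken when the tree is configured with
+  # something like `cmake -DGALOIS_ENABLE_DIST=ON ..`; dg_cpu then also links
+  # the distributed runtime, the CuSP partitioner, and the Gluon substrate.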
target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + + if(GALOIS_ENABLE_GPU) + target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_gpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + endif() +endif() + +set_target_properties(dg_cpu PROPERTIES + INTERFACE_POSITION_INDEPENDENT_CODE On + POSITION_INDEPENDENT_CODE On +) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h new file mode 100644 index 0000000000..ba3d1510bf --- /dev/null +++ b/libdeepgalois/include/deepgalois/Context.h @@ -0,0 +1,48 @@ +#pragma once +#include +#include +#include "deepgalois/types.h" +#include "deepgalois/reader.h" +#include "deepgalois/configs.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class Context { + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + std::string dataset; + Reader reader; + +public: + GraphCPU* graph_cpu; // the input graph, |V| = N + GraphCPU* getGraphPointer() { return graph_cpu; } + Context() : Context(false) {} + //! initializer for gpu; goes ahead and sets a few things + Context(bool use_gpu) : is_device(use_gpu), is_selfloop_added(false) {} + ~Context() {} + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } + size_t read_graph(bool selfloop) { + graph_cpu = new GraphCPU(); + graph_cpu->readGraph(dataset, selfloop); + is_selfloop_added = selfloop; + return graph_cpu->size(); + } + + //! Checks if subgraph being used, sets currenet graph, then calls degreex + //! counting + GraphCPU* getFullGraph() { + graph_cpu + ->degree_counting(); // TODO: why is it here? should be in read_graph + return graph_cpu; + } +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h new file mode 100644 index 0000000000..3ecf9ed411 --- /dev/null +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -0,0 +1,142 @@ +#ifndef __DG_DIST_CONTEXT__ +#define __DG_DIST_CONTEXT__ +#ifdef GALOIS_ENABLE_GPU +#include "deepgalois/cutils.h" +#else +#include "galois/graphs/GluonSubstrate.h" +#endif + +#include "deepgalois/types.h" +#include "deepgalois/Context.h" +#include "deepgalois/GraphTypes.h" +#include "deepgalois/reader.h" + +namespace deepgalois { + +class DistContext { + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + bool usingSingleClass; + std::string dataset; + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // learning graph version + DGraph* partitionedGraph; // the input graph, |V| = N + std::vector partitionedSubgraphs; + label_t* h_labels; // labels for classification. 
Single-class: Nx1, + // multi-class: NxE + float_t* h_feats; // input features: N x D +#ifdef GALOIS_ENABLE_GPU + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* d_normFactors; + float_t* d_normFactorsSub; +#else + galois::graphs::GluonSubstrate* syncSubstrate; +#endif + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector + normFactors; // normalization constant based on graph structure + std::vector normFactorsSub; // normalization constant for subgraph + + Reader reader; + +public: + // TODO better constructor + DistContext(); + DistContext(bool isDevice) + : is_device(isDevice), is_selfloop_added(false), usingSingleClass(true), + dataset(""), num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} + ~DistContext(); + + size_t read_graph(std::string dataset_str, bool selfloop = false); + + //! read labels of local nodes only + size_t read_labels(bool isSingleClassLabel, std::string dataset_str); + + //! read features of local nodes only + size_t read_features(std::string dataset_str); + + //! read masks of local nodes only + size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); + + DGraph* getGraphPointer() { return partitionedGraph; } + Graph* getLGraphPointer() { return lGraph; } + Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; + + void initializeSyncSubstrate(); +#ifdef GALOIS_ENABLE_GPU + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } + float_t* get_norm_factors_ptr() { return d_normFactors; } + float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } +#else + void saveDistGraph(DGraph* a); + galois::graphs::GluonSubstrate* getSyncSubstrate(); + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + float_t* get_norm_factors_ptr() { return normFactors.data(); } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } +#endif + + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + + //! allocate the norm factor vector + void allocNormFactor(); + void allocNormFactorSub(int subID); + //! 
construct norm factor vector by using data from global graph + void constructNormFactor(deepgalois::Context* globalContext); + void constructNormFactorSub(int subgraphID); + + void constructSubgraphLabels(size_t m, const mask_t* masks); + void constructSubgraphFeatures(size_t m, const mask_t* masks); + + //! return label for some node + //! NOTE: this is LID, not GID + label_t get_label(size_t lid) { return h_labels[lid]; } + + //! returns pointer to the features of each local node + float_t* get_in_ptr(); + + //! allocate memory for subgraphs (don't actually build them) + void allocateSubgraphs(int num_subgraphs, unsigned max_size); + + //! return if a vertex is owned by the partitioned graph this context contains + bool isOwned(unsigned gid); + //! return if part graph has provided vertex for given gid locally + bool isLocal(unsigned gid); + //! get GID of an lid for a vertex + unsigned getGID(unsigned lid); + //! get local id of a vertex given a global id for that vertex + unsigned getLID(unsigned gid); +}; + +} // namespace deepgalois + +#endif diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h new file mode 100644 index 0000000000..3f613a3039 --- /dev/null +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -0,0 +1,27 @@ +#pragma once + +#include "deepgalois/types.h" +#include "deepgalois/lgraph.h" + +#ifdef GALOIS_ENABLE_GPU +#define USE_CSRGRAPH +#ifdef USE_CSRGRAPH +#include "graph_gpu.h" +#endif +#else +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" +#endif + +namespace deepgalois { +using edge_iterator = index_t; +using GraphCPU = LearningGraph; +#ifdef GALOIS_ENABLE_GPU +using DGraph = CSRGraph; +using Graph = CSRGraph; +using GraphGPU = CSRGraph; +#else +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; +#endif +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h new file mode 100644 index 0000000000..bd33924eee --- /dev/null +++ b/libdeepgalois/include/deepgalois/Net.h @@ -0,0 +1,151 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. + */ +#pragma once +#include +#include "deepgalois/types.h" +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/optimizer.h" +#include "deepgalois/utils.h" +#include "deepgalois/Context.h" +#include "deepgalois/GraphTypes.h" +#include "deepgalois/DistContext.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +// N: number of vertices, D: feature vector dimentions, +// E: number of distinct labels, i.e. 
number of vertex classes +// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) +// layer 2: features N x 16, weights 16 x E, out N x E +class Net { + std::string header; + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t globalSamples; // number of samples: N + size_t distNumSamples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + unsigned h1; // hidden layer size + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting + // begins/ends below are global ids + size_t globalTrainBegin; + size_t globalTrainEnd; + size_t globalTrainCount; + size_t globalValBegin; + size_t globalValEnd; + size_t globalValCount; + size_t globalTestBegin; + size_t globalTestEnd; + size_t globalTestCount; + int val_interval; + int num_subgraphs; + unsigned subgraphNumVertices; + bool is_selfloop; + + mask_t* globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation + mask_t* globalTestMasks; // masks for test + // TODO it's looking like we may not even need these dist versions + mask_t* distTrainMasks; + mask_t* distValMasks; + mask_t* distTestMasks; // masks for test, dst + + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device + + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + // masks for subgraphs on device; size of local graph + mask_t* d_subgraphs_masks; + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network + + // one context is for entire graph; other is for partitioned graph + // TODO optimize single host case + + //! context holds all of the graph data + deepgalois::Context* graphTopologyContext; + + //! dist context holds graph data of the partitioned graph only + deepgalois::DistContext* distContext; + DGraph* dGraph; + Sampler* sampler; + +public: + //! Default net constructor + Net() + : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, false, true, false, false, + 25, 9000, 1) {} + + //! Net constructor + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv); + + // allocate memory for subgraph masks + void allocateSubgraphsMasks(int num_subgraphs); + + //! Initializes metadata for the partition: loads data, labels, etc + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } + void regularize(); // add weight decay + void train(optimizer* opt, bool need_validate); + double evaluate(std::string type, acc_t& loss, acc_t& acc); + + //! read masks of test set for GLOBAL set + void read_test_masks(std::string dataset); + //! 
read test masks only for local nodes; assumes dist context is initialized + void readDistributedTestMasks(std::string dataset); + + // void copy_test_masks_to_device(); + void construct_layers(); + + //! Add an l2_norm layer to the network + void append_l2norm_layer(size_t layer_id); + + //! Add an dense layer to the network + void append_dense_layer(size_t layer_id); + + //! Add an output layer to the network + void append_out_layer(size_t layer_id); + + //! Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true); + + // update trainable weights after back-prop + void update_weights(optimizer* opt); + + // forward propagation + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); + void bprop(); // back propagation + void set_contexts(); // Save the context + void set_netphases(net_phase phase); // current phase: train or test + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); + acc_t masked_multi_class_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h new file mode 100644 index 0000000000..ff1b460b10 --- /dev/null +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -0,0 +1,112 @@ +#pragma once +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { +#define ETA 1.5 // length factor of DB in sampling +#define SAMPLE_CLIP 3000 // clip degree in sampling +#define DEFAULT_SIZE_FRONTIER 1000 +#define DEFAULT_SIZE_SUBG 9000 + +class Sampler { +public: + typedef int db_t; + +protected: + index_t m; // number of vertice in the frontier + size_t count_; + + //! averaged degree of masked graph + int avg_deg; + //! average degree cut off to a clip + int subg_deg; + + // VertexList vertices_; + // mask_t* masks_; + + //! List of training nodes; sampling set + std::vector trainingNodes; + + //! masked original graph; typically to the training set + GraphCPU* globalMaskedGraph; + GraphCPU* globalGraph; + DGraph* partGraph; + + //! Reindex a graph to only contain those in the vertex set + void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! mg + template + void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); + + //! determine degree of each vertex in a masked graph (given by masks and g) + template + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees); + + //! Set masks bitset with IDs in the vertices VertexSet + // void createMasks(size_t n, VertexSet vertices, mask_t* masks); + // inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + // void checkGSDB(std::vector& DB0, std::vector& DB1, + // std::vector& DB2, index_t size); + + //! convert set of gids to lids + VertexSet convertToLID(VertexSet& gidSet); + + void createMasks(size_t n, VertexSet vertices, mask_t* masks) { + std::fill(masks, masks + n, 0); + for (auto v : vertices) + masks[v] = 1; + } + + //! 
helper function to get degree of some vertex given some graph + inline unsigned getDegree(GraphCPU* g, index_t v) { + return g->edge_end_host(v) - g->edge_begin_host(v); + } + + inline VertexList reindexVertices(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; + } + + // helper function for graph saint implementation below + void checkGSDB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, index_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); + } + +public: + Sampler() : m(DEFAULT_SIZE_FRONTIER) {} + ~Sampler() {} + + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output + void generateSubgraph(VertexSet& vertex_set, mask_t* masks, Graph* sg); + + //! API function for user-defined selection strategy + // TODO how to expose this? + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, + VertexSet& vertex_set); + virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); + + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); + + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h new file mode 100644 index 0000000000..5cbb1909fd --- /dev/null +++ b/libdeepgalois/include/deepgalois/configs.h @@ -0,0 +1,13 @@ +#pragma once + +namespace deepgalois { + +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + +#define NUM_DATASETS 9 +const std::string dataset_names[NUM_DATASETS] = { + "cora", "citeseer", "ppi", "pubmed", "flickr", + "yelp", "reddit", "amazon", "tester"}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h new file mode 100644 index 0000000000..4e4e9842b1 --- /dev/null +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -0,0 +1,192 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. 
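+// e.g. for N = 1000 this is (1000 + 255) / 256 = 4 blocks of CUDA_NUM_THREADS
+// threads each, so a launch of the form
+//   my_kernel<<<CUDA_GET_BLOCKS(N), CUDA_NUM_THREADS>>>(...)  // illustrative
+// covers all N elements at least once.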
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +inline unsigned CudaTest(const char* msg) { + cudaError_t e; + // cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + +inline const char* cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; +#if CUDA_VERSION >= 6000 + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; +#endif +#if CUDA_VERSION >= 6050 + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; +#endif + default: + break; + } + return "Unknown cublas status"; +} + +inline const char* cusparseGetErrorString(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return "CUSPARSE_STATUS_ZERO_PIVOT"; + default: + break; + } + return "Unknown cusparse status"; +} + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + default: + break; + } + return "Unknown curand status"; +} + +// CUDA: various 
checks for different function calls. +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUSPARSE_CHECK(condition) \ + do { \ + cusparseStatus_t status = condition; \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuSPARSE error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cusparseGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +// CUDA: check for error after kernel execution and exit loudly if there is one. +#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + +inline void print_device_vector(size_t n, const float_t* d_x, + std::string name = "x") { + float_t* h_x = new float_t[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} + +inline void print_device_int_vector(size_t n, const int* d_x, + std::string name = "x") { + int* h_x = new int[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h new file mode 100644 index 0000000000..2918cdd8dd --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -0,0 +1,160 @@ +#ifndef __GLUON_GRADIENTS__ +#define __GLUON_GRADIENTS__ + +#include "galois/gstl.h" +#include "galois/runtime/Network.h" +#include "deepgalois/types.h" + +namespace deepgalois { + +/** + * Wraps the weight gradients and provides an interface for Gluon to + * synchronize them during distributed execution. + */ +class GluonGradients { +private: + //! Data type used for gradients + using GradientType = float_t; + //! type that's being used by the gradient vector + using GradientVecType = vec_t; + + GradientVecType& _gradients; + //! number of weight gradients + size_t _numWeights; + //! number of gradients this host is responsible for + size_t _numOwned; + + //! My host ID + unsigned _myHost; + //! Total num hosts in system + unsigned _totalHosts; + + //! first node I own + unsigned _beginMaster; + //! last node I own (contiguous chunk) + unsigned _endMaster; + + //! my nodes whose's masters are on other hosts; global ids + std::vector> _mirrorNodes; + //! 
nodes that are mirrors on this host + std::vector> _mirrorRanges; + +public: + bool is_a_graph() { return true; } + + /** + * Save weight gradients + number of them (i.e. size). + * Then setup mirror metadata for Gluon to use during setup. + */ + GluonGradients(GradientVecType& gradients, size_t numWeights) + : _gradients(gradients), _numWeights(numWeights) { + _myHost = galois::runtime::getSystemNetworkInterface().ID; + _totalHosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + _mirrorNodes.resize(_totalHosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < _totalHosts; h++) { + std::pair curRange = + galois::block_range((size_t)0, _numWeights, h, _totalHosts); + + if (h != _myHost) { + // setup mirrors for the host h which is just the list of IDs + size_t curW = curRange.first; + size_t lastW = curRange.second; + size_t numW = lastW - curW; + + // set mirrors for host h + _mirrorNodes[h].reserve(numW); + for (; curW < lastW; curW++) { + _mirrorNodes[h].push_back(curW); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + _beginMaster = curRange.first; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; + + // first range is 0 to begin master + if (_beginMaster > 0) { + galois::gInfo("[", _myHost, "] Mirror range ", 0, " to ", + _beginMaster); + _mirrorRanges.emplace_back(0, _beginMaster); + } + + // second range is endMaster to end + if (_endMaster < _numWeights) { + galois::gInfo("[", _myHost, "] Mirror range ", _endMaster, " to ", + _numWeights); + _mirrorRanges.emplace_back(_endMaster, _numWeights); + } + } + } + + galois::gInfo("[", _myHost, "] This host owns ", _beginMaster, " to ", + _endMaster); + } + + //! Size is number of weights + size_t size() const { return _numWeights; } + + //! Global size is number of weights + size_t globalSize() const { return _numWeights; } + + //! Return the weights owned by this host + size_t numMasters() const { return _numOwned; } + + //! Return host ID + unsigned myHostID() const { return _myHost; } + + //! Return num hosts in the system + unsigned numHosts() const { return _totalHosts; } + + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t nodeID) const { return nodeID; } + + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t nodeID) const { return nodeID; } + + //! Return local weight w + GradientType& getData(uint32_t w) const { return _gradients[w]; } + + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return _mirrorRanges; + } + + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { return _mirrorNodes; } + + //! clears the vector + // TODO return to this when we start distributing on GPUs; wrapper + // end probably shouldn't be managing this MAYBE + void deallocate() { _gradients.clear(); } + + // Essentially no-op functions follow + + //! no nodes with edges + size_t getNumNodesWithEdges() { return 0; } + + //! No edges; not a vertex cut + bool is_vertex_cut() const { return false; } + + //! no edges, return 0 + unsigned edge_begin(uint32_t) { return 0; } + + //! no edges, return 0 + unsigned edge_end(uint32_t) { return 0; } + + //! no edges, return 0 + unsigned getEdgeDst(uint32_t) { return 0; } + + //! 
no edges, return 0 + unsigned getEdgeData(uint32_t) { return 0; } +}; + +} // namespace deepgalois + +#endif // end header guard diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h new file mode 100644 index 0000000000..d4c23af1bb --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -0,0 +1,51 @@ +#ifndef GALOIS_ENABLE_GPU +#ifndef __GRAD_SYNC_STRUCT__ +#define __GRAD_SYNC_STRUCT__ + +#include "deepgalois/types.h" + +struct GradientSync { + using ValTy = float_t; + + static ValTy extract(uint32_t, float_t& weight) { return weight; } + + static bool reduce(uint32_t, float_t& weight, ValTy y) { + // TODO merge function here + // for now make sure the weights are close enough + // if (std::abs(weight - y) > 0.00001) { + // galois::gInfo("weight ", node_id, " not consistent with one received"); + //} + if (y == 0) { + galois::gPrint("nothing important\n"); + } + weight += y; + // need a post process divide all step + // weight /= 2; + return true; + } + + //! reset weight to 0 + static void reset(uint32_t, float_t& weight) { weight = 0; } + + //! save weight + static void setVal(uint32_t, float_t& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +// no bitset; everything is sent anyways +#endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h new file mode 100644 index 0000000000..570aa56d2b --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -0,0 +1,60 @@ +#ifndef GALOIS_ENABLE_GPU +#ifndef __GRAPH_CONV_SYNC_STRUCT__ +#define __GRAPH_CONV_SYNC_STRUCT__ +#include "galois/BufferWrapper.h" + +struct GraphConvSync { + using ValTy = galois::BufferWrapper; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + ValTy vecToReturn( + &deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); + // move constructor should kick in here to avoid return copy + return vecToReturn; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += + y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +GALOIS_SYNC_STRUCTURE_BITSET(conv); +#endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h new file mode 100644 index 0000000000..8ef845b1d9 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -0,0 +1,23 @@ +#pragma once +#include "deepgalois/types.h" +//! For each node in the graph, add the embeddings of all of its neighbors +//! together (using norm_factor if specified) +#ifndef GALOIS_ENABLE_GPU +#include "deepgalois/GraphTypes.h" +namespace deepgalois { +// TODO template arg +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +} // namespace deepgalois +#else +#include "deepgalois/GraphTypes.h" +// #include "graph_gpu.h" +namespace deepgalois { +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +} // namespace deepgalois +#endif diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h new file mode 100644 index 0000000000..e4b59e694f --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` +class elementwise_add_layer : public layer { +public: + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { + return std::string("elementwise_add"); + } + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + in_grad = out_grad; + } +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h new file mode 100644 index 0000000000..14c47c9813 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -0,0 +1,85 @@ +#pragma once +#include "layer.h" +#include "deepgalois/layers/aggregator.h" + +/** + * GraphConv Layer; based on DGL implementation + follows TinyDNN layer + * convention + * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html + * + * Parameters + * ---------- + * x: int, number of samples. + * y: int, Input feature size. + * z: int, Output feature size. + * dropout: bool, optional, if True, a dropout operation is applied before + * other operations. + * norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. + * Default: ``True``. + * bias : bool, optional, if True, adds a learnable bias to the output. + * Default: ``False``. + * activation: default false + */ +namespace deepgalois { +class graph_conv_layer : public layer { +public: + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float_t dropout_rate, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); + scale_ = 1. / (1. - dropout_rate_); + } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} + ~graph_conv_layer() {} + void malloc_and_init(); + std::string layer_type() const override { return std::string("graph_conv"); } + virtual acc_t get_weight_decay_loss(); + //! Uses weights contained in this layer to update in_data (results from + //! previous) and save result to out_data + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + //! Uses gradients from layer after this one to update both own weight + //! 
gradients as well as gradients for the features (in_grad) + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + // user-defined aggregate function +#ifndef GALOIS_ENABLE_GPU + virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); +#else + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out); + void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); +#endif + // user-defined combine function + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out); + +private: + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + const float_t dropout_rate_; + float_t scale_; + float_t* out_temp; //!< intermediate data temporary + float_t* in_temp; + float_t* in_temp1; + float_t* trans_data; // y*x + mask_t* dropout_mask; // x*y + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 + + // Glorot & Bengio (AISTATS 2010) + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, + unsigned seed = 1); + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h new file mode 100644 index 0000000000..c7167700a2 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// L2 Normalization Layer +class l2_norm_layer : public layer { +public: + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + } + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + ~l2_norm_layer() {} + std::string layer_type() const override { return std::string("l2_norm"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + +protected: + float_t epsilon_; + float_t scale_; +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h new file mode 100644 index 0000000000..6e1ac879cc --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -0,0 +1,215 @@ +#pragma once +/** + * Code from on below link. Modified under Galois's license. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ +#include +#include "deepgalois/GraphTypes.h" +#include "deepgalois/Context.h" +#include "deepgalois/optimizer.h" +#include "deepgalois/layers/node.h" +#include "deepgalois/DistContext.h" + +#ifndef GALOIS_ENABLE_GPU +#include "galois/graphs/GluonSubstrate.h" +#include "deepgalois/layers/GluonGradients.h" +#endif + +namespace deepgalois { + +/** + * base class of all kind of NN layers + * + * sub-class should override these methods: + * - forward_propagation ... body of forward-pass calculation + * - back_propagation ... body of backward-pass calculation + * - in_shape ... specify input data shapes + * - out_shape ... specify output data shapes + * - layer_type ... name of layer + * + * Node inheritance is just to get accessed to linked-list semantics it + * provides + **/ +class layer : public deepgalois::node { +public: + using ContextType = deepgalois::DistContext; + +protected: +#ifndef GALOIS_ENABLE_GPU + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; +#endif + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + bool use_mask; + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; // parameters to learn on device (GPU) + float_t* d_weight_grad; // weight gradient on device (GPU) + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; // masks on device (GPU) + float_t* loss; // error for each vertex: N x 1 + ContextType* context; + label_t* labels; + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores + vec_t norm_scores_grad; // gradients of normalized scores +// TODO +#ifdef GALOIS_ENABLE_GPU + GraphGPU* graph_gpu; +#else + Graph* graph_cpu; + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif + +public: + layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims), labels(NULL) {} + virtual ~layer() = default; + virtual std::string layer_type() const = 0; + virtual void malloc_and_init() {} + void print_layer_info() { //! 
debug print function + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "] Layer " << level_ + << " type: " << layer_type() << "input[" << input_dims[0] << "," + << input_dims[1] << "] output[" << output_dims[0] << "," + << output_dims[1] << "]\n"; + // galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + // "input[", input_dims[0], ",", input_dims[1], "] output[", + // output_dims[0], ",", output_dims[1], "]\n"); + } + // get methods + virtual acc_t get_prediction_loss() { return acc_t(0); } + virtual acc_t get_weight_decay_loss() { return acc_t(0); } + bool trainable() const { return trainable_; } + std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } + float_t* get_weights_ptr() { return &W[0]; } + float_t* get_weights_device_ptr() { return d_W; } + float_t* get_grads_ptr() { return &weight_grad[0]; } + float_t* get_grads_device_ptr() { return d_weight_grad; } + + // set methods + void set_netphase(net_phase ctx) { phase_ = ctx; } + void set_context(ContextType* ctx) { context = ctx; } + void set_trainable(bool trainable) { + trainable_ = trainable; + } // is this layer trainable? + void set_labels_ptr(label_t* ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } + void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } + void set_name(std::string name) { name_ = name; } // name metadata +#ifndef GALOIS_ENABLE_GPU + void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } +#else + void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } +#endif + void update_dim_size(size_t g_size) { + input_dims[0] = output_dims[0] = g_size; + } + + //! set the data of the previous layer connected to this one + void set_in_data(float_t* data) { + prev_ = + std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + use_mask = false; + if (masks != NULL) { + use_mask = true; +#ifndef GALOIS_ENABLE_GPU + masks_ = masks; +#else + d_masks_ = masks; +#endif + } + } + + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], + output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + + // main functions for layer work + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; + + //! calls forward propagation using previous layer as input and writes + //! to next layer as output + void forward() { + // std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + + //! calls backward propagation + void backward() { + // std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + + //! use optimizer to update weights given gradient (weight_grad) + void update_weight(deepgalois::optimizer* opt) { +#ifndef GALOIS_ENABLE_GPU + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ // bool parallel = (W.size() >= 512); + opt->update(layer::weight_grad, layer::W); // W += grad +#else + opt->update_gpu(input_dims[1] * output_dims[1], d_weight_grad, + d_W); // W += grad +#endif + // prev()->clear_grads(); + next()->clear_grads(); + } +}; + +//! Connects tail to head's edge and sets that edge's target to tail +// inline void connect(layer* head, layer* tail) { +inline void connect(layer* head, layer* tail) { + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); +} + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h new file mode 100644 index 0000000000..2f43e0a228 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -0,0 +1,22 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// Leaky ReLU Layer +class leaky_relu_layer : public layer { +public: + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + ~leaky_relu_layer() {} + std::string layer_type() const override { return std::string("leaky_relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + +protected: + float_t epsilon_; + size_t n; +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h new file mode 100644 index 0000000000..ebcc774cc1 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -0,0 +1,34 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class linear_layer : public layer { +public: + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; + } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : linear_layer(level, 1.0, 0.0, in_dim, out_dim) {} + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } + +protected: + float_t scale_, bias_; +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h new file mode 100644 index 0000000000..11499bbede --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -0,0 +1,75 @@ +#pragma once +/** + * Code modified from below + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/node.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ + +#include +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +class node; +class layer; +class edge; + +typedef std::shared_ptr edgeptr_t; + +// node data structure: each layer is a node, two layers are connected by an +// edge +class node : public std::enable_shared_from_this { +public: + node() { + prev_ = NULL; + next_ = NULL; + } + // node(size_t in_size, size_t out_size) { + //} //: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + const edgeptr_t next() const { return next_; } + +protected: + // node() = delete; + friend void connect(layer* head, layer* tail); + mutable edgeptr_t prev_; + mutable edgeptr_t next_; +}; + +// edges manage the input/output data and gradients between nodes +class edge { +public: + edge(node* prev, size_t n, size_t len) + : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc(); + void clear_grads(); + void merge_grads(float_t* dst); + void set_data(float_t* ptr) { data_ = ptr; } + float_t* get_data() { return data_; } + const float_t* get_data() const { return data_; } + float_t* get_gradient() { return grad_; } + const float_t* get_gradient() const { return grad_; } + + const node* next() const { return next_; } + node* prev() { return prev_; } + const node* prev() const { return prev_; } + void add_next_node(node* next) { next_ = next; } + +private: + size_t num_samples_; // number of samples + size_t ft_dim_; // feature dimensions + float_t* data_; // feature vectors + float_t* grad_; // gradients + node* prev_; // previous node, "producer" of data + node* next_; // next node, "consumer" of data +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h new file mode 100644 index 0000000000..4e1c47ed77 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h new file mode 100644 index 0000000000..be133995c0 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class sigmoid_loss_layer : public layer { +public: + sigmoid_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~sigmoid_loss_layer(); + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } + void malloc_and_init(); + inline label_t get_label(size_t i, size_t j); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_prediction_loss(); +}; +} // namespace deepgalois diff --git 
a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..7ba096a2aa --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class softmax_loss_layer : public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~softmax_loss_layer(); + std::string layer_type() const override { + return std::string("softmax_loss"); + } + void malloc_and_init(); + inline label_t get_label(size_t i); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_prediction_loss(); +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h new file mode 100644 index 0000000000..01b84a60b6 --- /dev/null +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -0,0 +1,171 @@ +#pragma once +#include "deepgalois/types.h" +#include +#include + +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + +namespace deepgalois { + +class LearningGraph { + typedef std::vector IndexList; + // typedef index_t* IndexList; +protected: + bool is_device; + index_t max_size_; + index_t num_vertices_; + index_t num_edges_; + IndexList rowptr_; + IndexList colidx_; + IndexList degrees_; + vdata_t* vertex_data_; + edata_t* edge_data_; + + index_t* d_rowptr_; + index_t* d_colidx_; + index_t* d_degrees_; + vdata_t* d_vertex_data_; + edata_t* d_edge_data_; + std::vector> mirrorNodes; + +public: + typedef size_t iterator; + LearningGraph(bool use_gpu) + : is_device(use_gpu), max_size_(0), num_vertices_(0), num_edges_(0), + vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph() : LearningGraph(false) {} + ~LearningGraph() { dealloc(); } + void init(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + } + size_t size() { return (size_t)num_vertices_; } + size_t sizeEdges() { return (size_t)num_edges_; } + index_t get_degree(index_t vid) { return degrees_[vid]; } + + iterator begin() const { return iterator(0); } + iterator end() const { return iterator(num_vertices_); } + void progressPrint(unsigned maxii, unsigned ii); + void allocOnDevice(bool no_edge_data_); + void copy_to_cpu(); + void copy_to_gpu(); + void dealloc(); + void degree_counting(); + void constructNodes() {} + void set_max_size(index_t max) { + assert(max > 0); + max_size_ = max; + } + + void readGraph(std::string dataset, bool selfloop = false); + void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } + void allocateFrom(index_t nv, index_t ne) { + // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, + // num_edges_); + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_ + 1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); + rowptr_[0] = 0; + } + + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; + if (edge_data_) + edge_data_[eid] = edata; + } + + void add_selfloop() { + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i + 1]; + bool 
selfloop_inserted = false; + if (start == end) { + colidx_[start + i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst = old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e + i] = i; + colidx_[e + i + 1] = dst; + } else if (e + 1 == end) { + selfloop_inserted = true; + colidx_[e + i + 1] = i; + colidx_[e + i] = dst; + } else + colidx_[e + i] = dst; + } else + colidx_[e + i + 1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) + rowptr_[i] += i; + num_edges_ += num_vertices_; + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, + num_edges_); + } + + bool isLocal(index_t vid); + index_t getLID(index_t vid); + bool is_vertex_cut(); + std::vector>& getMirrorNodes(); + uint64_t numMasters(); + uint64_t globalSize(); + + index_t* row_start_host_ptr() { return &rowptr_[0]; } + index_t* edge_dst_host_ptr() { return &colidx_[0]; } + index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } + index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } + index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } +#ifndef GALOIS_ENABLE_GPU + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } + index_t getDegree(index_t vid) { return degrees_[vid]; } + index_t* row_start_ptr() { return &rowptr_[0]; } + const index_t* row_start_ptr() const { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + const index_t* edge_dst_ptr() const { return &colidx_[0]; } + index_t* degrees_ptr() { return °rees_[0]; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } +#else + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src + 1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + // CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + // CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + index_t* row_start_ptr() { return d_rowptr_; } + const index_t* row_start_ptr() const { return d_rowptr_; } + index_t* edge_dst_ptr() { return d_colidx_; } + const index_t* edge_dst_ptr() const { return d_colidx_; } + index_t* degrees_ptr() { return d_degrees_; } + edata_t* edge_data_ptr() { return d_edge_data_; } + vdata_t* vertex_data_ptr() { return d_vertex_data_; } + // const vdata_t *vertex_data_ptr() const { return vertex_data_; } + // const edata_t *edge_data_ptr() const { return edge_data; } + void print_test(); +#endif +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh new file mode 100644 index 0000000000..e6b5836386 --- /dev/null +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -0,0 +1,180 @@ +/** + * File inspired by similar one from TinyDNN + * https://github.com/tiny-dnn/ + */ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include +#include +#include +#include "deepgalois/types.h" + +#ifdef 
USE_MKL +#include +#else +extern "C" { +#include +} +#endif + +namespace deepgalois { + +namespace math { + +// single-precision dense matrix multiply +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); + +// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nonzero_idx, const float* B, const float beta, float* C); + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y); + +//! add 2 arrays for n elements +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); + +//! multiply n elements of vector by scalar +void scal(size_t n, const float_t alpha, float_t* x); +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y); +void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); + +//! do dot product of 2 vectors +float_t dot(size_t n, const float_t* x, const float_t* y); + +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z); + +// SAXPY stands for “Single-precision A*X Plus Y" +void axpy(size_t n, const float_t a, float_t* x, float_t* y); + +// Returns the index of the maximum value +int argmax(const size_t n, const float_t* x); // the arguments of the maxima + +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) +//! / 2 +float_t l2_norm(size_t n, const float_t* a); + +//! clear n elements of a vector +void clear_cpu(size_t n, float_t* in); + +//! copy vector from in -> out; first len elements +void copy_cpu(size_t len, const float_t* in, float_t* out); + +// dropout functions randomly remove weights +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* mask, float_t* out); + +// dropout derivative: use existing dropouts in masks instead of generating +// them; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* mask, float_t* out); + +//! 
ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise +void relu_cpu(size_t n, const float_t* in, float_t* out); +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + +// Leaky ReLU +void leaky_relu(float_t epsilon, float_t in, float_t& out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out); +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out); + +// Loss function for single-class label (one-hot) data: softmax +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); + +// Cross entropy +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); + +// Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); + +// dropout functions randomly remove weights +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, + float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, + float_t* out); + +//! transposes a matrix (malloc'd array) +void transpose(size_t x, size_t y, const float_t* in, float_t* out); + +} // namespace math +} // namespace deepgalois + +// GPU operators +bool isnan_gpu(int n, + const float_t* array); // does array contain any 'nan' element +void init_const_gpu(int n, float_t value, float_t* array); +void copy_gpu(int len, const float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, + float_t* y); // axpy +void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff); // Leaky ReLU derivative +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nonzero_idx, const float* B, const float beta, + float* trans_C, float* C); +void softmax_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* 
out_data); +void d_softmax_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); +void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int n, const float alpha, float* X); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r); +bool is_allocated_device(float_t* data); +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); +void float_malloc_device(int n, float_t*& ptr); +void float_free_device(float_t*& ptr); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); +void uint8_malloc_device(int n, uint8_t*& ptr); +void uint8_free_device(uint8_t*& ptr); +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss); +acc_t l2_norm_gpu(int n, const float_t* in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); +#endif diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h new file mode 100644 index 0000000000..694819591c --- /dev/null +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -0,0 +1,197 @@ +/** + * Code taken/modified from below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused under 3-BSD + */ +#pragma once + +// TODO: +// - use classes, not structs (modern C++) +// - templatize this instead of using inheritance +// - put optimizers in their own namespace + +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +// base class of optimizer +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss +// function) +struct optimizer { + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t& dW, vec_t& W) = 0; +#ifdef GALOIS_ENABLE_GPU + virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; +#endif + virtual void reset() {} // override to implement pre-learning action +}; + +// helper class to hold N values for each weight +template +struct stateful_optimizer : public optimizer { + void reset() override { + for (auto& e : E_) + e.clear(); + } + +protected: + template + vec_t& get(const vec_t& key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) + E_[Index][&key].resize(key.size(), float_t(0)); + return E_[Index][&key]; + } + std::unordered_map E_[N]; +#ifdef GALOIS_ENABLE_GPU + template + float_t* get_gpu(const size_t n, const float_t* key); + std::unordered_map dE_[N]; +#endif +}; + +/** + * adaptive gradient method + * + * J Duchi, E Hazan and Y Singer, + * Adaptive subgradient methods for online learning and stochastic optimization + * The Journal of Machine Learning Research, pages 2121-2159, 2011. + **/ +struct adagrad : public stateful_optimizer<1> { + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate +private: + float_t eps; +}; + +/** + * RMSprop + * + * T Tieleman, and G E Hinton, + * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) + **/ +struct RMSprop : public stateful_optimizer<1> { + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate + float_t mu; // decay term +private: + float_t eps; // constant value to avoid zero-division +}; + +// Adam: A Method for Stochastic Optimization +// http://arxiv.org/abs/1412.6980 +struct adam : public stateful_optimizer<2> { + adam() + : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * @brief [a new optimizer (2015)] + * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) + * http://arxiv.org/abs/1412.6980] + * + */ +struct adamax : public stateful_optimizer<2> { + adamax() + : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), + eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef 
GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +// SGD without momentum +// slightly faster than tiny_dnn::momentum +struct gradient_descent : public optimizer { + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate + float_t lambda; // weight decay +}; + +/** + * SGD with momentum + * + * B T Polyak, + * Some methods of speeding up the convergence of iteration methods + * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. + **/ +struct momentum : public stateful_optimizer<1> { +public: + momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +/** + * SGD with Nesterov momentum + * + * Y Nesterov, + * A method for unconstrained convex minimization problem with the rate of + * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. + **/ +struct nesterov_momentum : public stateful_optimizer<1> { +public: + nesterov_momentum() + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h new file mode 100644 index 0000000000..6e5cb0fe5b --- /dev/null +++ b/libdeepgalois/include/deepgalois/random.h @@ -0,0 +1,53 @@ +// random number generators for CPU +#pragma once + +#include +#include "galois/Galois.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + +class random_generator { +public: + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h new file mode 100644 index 0000000000..c25eeceac2 --- /dev/null +++ b/libdeepgalois/include/deepgalois/reader.h @@ -0,0 +1,22 @@ +#pragma once +#include "deepgalois/lgraph.h" +// #include "galois/DistGalois.h" +namespace deepgalois { + +class Reader { +private: + std::string dataset_str; + void progressPrint(unsigned maxii, unsigned ii); + +public: + Reader() : dataset_str("") {} + Reader(std::string dataset) : dataset_str(dataset) {} + void init(std::string dataset) { dataset_str = dataset; } + size_t read_labels(bool is_single_class, label_t*& labels); + size_t read_features(float_t*& feats, std::string filetype = "bin"); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks); + void readGraphFromGRFile(LearningGraph* g); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h new file mode 100644 index 0000000000..17dd05b15d --- /dev/null +++ b/libdeepgalois/include/deepgalois/types.h @@ -0,0 +1,58 @@ +#ifndef _GNN_TYPES_H_ +#define _GNN_TYPES_H_ +#include +#include +#include +#include + +// TODO namespace + +#ifdef CNN_USE_DOUBLE +typedef double float_t; +typedef double feature_t; +#else +typedef float float_t; +typedef float feature_t; // feature type +#endif +typedef std::vector vec_t; // feature vector (1D) +typedef std::vector + tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector FV; // feature vector +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef uint8_t label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test +typedef uint32_t VertexID; +typedef uint64_t EdgeID; +typedef std::vector VertexList; +typedef std::set VertexSet; +typedef std::vector dims_t; // dimentions type + +typedef uint32_t index_t; // index type +typedef float_t edata_t; // edge data type +typedef float_t vdata_t; // vertex data type +typedef float_t* emb_t; // embedding (feature vector) type + +enum class net_phase { train, test }; + +#define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define BLOCK_SIZE 256 +#define WARP_SIZE 32 +#define MAX_NUM_CLASSES 128 +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +#define USE_CUSPARSE + +namespace deepgalois { +// TODO only being used by graph conv layer at the moment so extern works, +// but this design is bad and needs to be revisited + +//! 
Set this to let sync struct know where to get data from +extern float_t* _dataToSync; +//! Set this to let sync struct know the size of the vector to use during +//! sync +extern long unsigned _syncVectorSize; +} // namespace deepgalois + +#endif diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h new file mode 100644 index 0000000000..bf74aad196 --- /dev/null +++ b/libdeepgalois/include/deepgalois/utils.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +//! tracks max mem usage with rusage +// TODO use Galois's getrusage functionality +class ResourceManager { +public: + ResourceManager() {} + ~ResourceManager() {} + // peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return "Peak memory: " + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } + +private: + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } +}; + +// TODO don't need a separate timer: use Galois's regular timer +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; + +// sequential prefix sum +template +inline std::vector prefix_sum(const std::vector& in) { + std::vector prefix(in.size() + 1); + OutTy total = 0; + for (size_t i = 0; i < in.size(); i++) { + prefix[i] = total; + total += (OutTy)in[i]; + } + prefix[in.size()] = total; + return prefix; +} + +template +OutTy* parallel_prefix_sum(const std::vector& in); + +// Utility function to randomly select k items from [begin, end) +template +inline T* select_k_items(T k, T begin, T end) { + auto i = begin; + + // reservoir[] is the output array. Initialize + // it with first k vertices + T* reservoir = new T[k]; + for (; i < k; i++) + reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. + auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) + reservoir[j] = i; + } + return reservoir; +} + +// Utility function to find ceiling of r in arr[l..h] +template +inline T find_ceil(T* arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? 
l : -1; +} + +// Utility function to select one element from n elements given a frequency +// (probability) distribution +// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ +template +T select_one_item(T n, T* dist) { + T* offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) + offsets[i] = offsets[i - 1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n - 1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); +} + +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t* masks, + size_t num_classes, label_t* ground_truth, float_t* pred); + +} // namespace deepgalois diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt new file mode 100644 index 0000000000..d9bf751eac --- /dev/null +++ b/libdeepgalois/licensenote.txt @@ -0,0 +1,59 @@ +TODO + +figure out which files have coded based on other codebsaes, get license, +note here + +e.g. +https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn +under BSD-3 + +DGL structure as well from what I can tell + +================================================================================ +Caffe License +================================================================================ + +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014-2017 The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014-2017, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, +or otherwise, the contributor releases their content to the +license and copyright terms herein. 
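The utility header above (deepgalois/utils.h) declares prefix_sum, find_ceil, and select_one_item, which together pick an index in proportion to a frequency distribution: accumulate a running sum of the frequencies, draw r uniformly from [1, total], and binary-search for the first running sum that is >= r. Below is a minimal standalone sketch of that pattern, with illustrative names only; it does not reuse the library's templates and is not part of the diff, just a self-contained C++ illustration of the same idea.

    #include <cstdlib>
    #include <ctime>
    #include <iostream>
    #include <vector>

    // prefix[i] holds the sum of freq[0..i-1]; prefix.back() is the total mass
    // (same exclusive-prefix convention as deepgalois::prefix_sum).
    static std::vector<int> prefix_sum(const std::vector<int>& freq) {
      std::vector<int> prefix(freq.size() + 1, 0);
      for (size_t i = 0; i < freq.size(); i++)
        prefix[i + 1] = prefix[i] + freq[i];
      return prefix;
    }

    // Draw an index in [0, freq.size()) with probability proportional to
    // freq[i], by binary-searching the prefix sums (the role find_ceil plays).
    static size_t weighted_pick(const std::vector<int>& prefix) {
      int total = prefix.back();
      int r     = (std::rand() % total) + 1; // r in [1, total]
      size_t lo = 1, hi = prefix.size() - 1;
      while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (prefix[mid] < r)
          lo = mid + 1;
        else
          hi = mid;
      }
      return lo - 1; // convert back to an index into the frequency array
    }

    int main() {
      std::srand(static_cast<unsigned>(std::time(nullptr)));
      std::vector<int> freq = {1, 3, 6}; // index 2 should win ~60% of draws
      auto prefix = prefix_sum(freq);
      std::vector<int> hits(freq.size(), 0);
      for (int i = 0; i < 10000; i++)
        hits[weighted_pick(prefix)]++;
      for (size_t i = 0; i < hits.size(); i++)
        std::cout << "index " << i << " drawn " << hits[i] << " times\n";
      return 0;
    }

Running the sketch prints draw counts roughly proportional to 1:3:6, which is the behavior select_one_item relies on when sampling from an arbitrary discrete distribution.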
diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh new file mode 100755 index 0000000000..da9861fb2e --- /dev/null +++ b/libdeepgalois/scripts/run-multi.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +#GRAPHS="ppi yelp amazon" +GRAPHS="ppi" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 64 128" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh new file mode 100755 index 0000000000..a6bc223ebd --- /dev/null +++ b/libdeepgalois/scripts/run-single.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +GRAPHS="cora citeseer pubmed flickr reddit" +#GRAPHS="cora" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 32 64 128 256 512" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/test-multi.sh b/libdeepgalois/scripts/test-multi.sh new file mode 100755 index 0000000000..a67bd047a8 --- /dev/null +++ b/libdeepgalois/scripts/test-multi.sh @@ -0,0 +1 @@ +./gcn ppi -k=20 -t=14 -sc=0 -h=128 diff --git a/libdeepgalois/scripts/test-single.sh b/libdeepgalois/scripts/test-single.sh new file mode 100755 index 0000000000..78093d71ed --- /dev/null +++ b/libdeepgalois/scripts/test-single.sh @@ -0,0 +1 @@ +./gcn cora -k=200 -t=14 diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp new file mode 100644 index 0000000000..21bcad0fe3 --- /dev/null +++ b/libdeepgalois/src/DistContext.cpp @@ -0,0 +1,405 @@ +#include "deepgalois/DistContext.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" + +namespace deepgalois { +DistContext::DistContext() : DistContext(false) { syncSubstrate = NULL; } + +DistContext::~DistContext() {} + +void DistContext::saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy")); +} + +// TODO move to reader class +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; + this->usingSingleClass = isSingleClassLabel; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading labels from disk...\n"); + + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; + // read file header + in >> m >> this->num_classes >> std::ws; + assert(m == dGraph->globalSize()); + + // size of labels should be # local nodes + if (isSingleClassLabel) { + galois::gPrint("[", myID, "] One hot labels...\n"); + // single-class (one-hot) label for each vertex: N x 1 + this->h_labels = new label_t[dGraph->size()]; + } else { + galois::gPrint("[", myID, "] Multi-class labels...\n"); + this->h_labels = new label_t[dGraph->size() * this->num_classes]; + // multi-class label for each vertex: N x E + } + + uint32_t foundVertices = 0; + unsigned v = 0; + // each line contains a set of 0s and 1s + while (std::getline(in, line)) { + // only bother if local node + if (dGraph->isLocal(v)) { + std::istringstream label_stream(line); + unsigned x; + // for each class + for (size_t idx = 0; idx < this->num_classes; ++idx) { + // check if that class is labeled + label_stream >> x; + + // diff between single and multi class + if (isSingleClassLabel) { + if (x != 0) { + // set local id + this->h_labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } else { + this->h_labels[dGraph->getLID(v) * this->num_classes + idx] = x; + foundVertices++; + } + } + } + // always increment v + v++; + } + + in.close(); + + // print the number of vertex classes + galois::gPrint("[", myID, + "] Done with labels, unique label counts: ", 
num_classes, + "; set ", foundVertices, " nodes\n"); + + return num_classes; +} + +// TODO move to reader class +size_t DistContext::read_features(std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading features from disk...\n"); + + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + size_t m; // m = number of vertices + // dimension read + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> this->feat_len >> std::ws; + ifs.close(); + + galois::gPrint("[", myID, "] N x D: ", m, " x ", feat_len, "\n"); + + // TODO read in without using 2 in-memory buffers + // full read feats to load into h_feats + float_t* fullFeats = new float_t[m * feat_len]; + // actual stored feats + h_feats = new float_t[dGraph->size() * feat_len]; + + // read in full feats + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + in.read((char*)fullFeats, sizeof(float_t) * m * feat_len); + in.close(); + + // get the local ids we want + size_t count = 0; + for (size_t i = 0; i < m; i++) { + if (dGraph->isLocal(i)) { + // h_feats[count * feat_len] = fullFeats[i]; + std::copy(fullFeats + i * DistContext::feat_len, + fullFeats + (i + 1) * DistContext::feat_len, + &this->h_feats[dGraph->getLID(i) * DistContext::feat_len]); + count++; + } + } + GALOIS_ASSERT(count == dGraph->size()); + free(fullFeats); + + galois::gPrint("[", myID, "] Done with features, feature length: ", feat_len, + "\n"); + + return feat_len; +} + +// TODO move to reader class/reuse reader class somehow +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, + mask_t* masks, DGraph* dGraph) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + GALOIS_DIE("Dataset currently not supported"); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + galois::gPrint("[", myID, "] ", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, "(", + (float)sample_count / (float)n * (float)100, "\%)\n"); + in.close(); + return sample_count; +} + +float_t* DistContext::get_in_ptr() { return &h_feats[0]; } + +void DistContext::initializeSyncSubstrate() { + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + *DistContext::partitionedGraph, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); +} + +void DistContext::allocNormFactor() { +#ifdef USE_MKL + this->normFactors.resize(partitionedGraph->sizeEdges()); +#else + this->normFactors.resize(partitionedGraph->size()); +#endif +} + +void DistContext::allocNormFactorSub(int subID) { +#ifdef USE_MKL + 
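  // The normalization buffers are sized to match how they are indexed later:
  // under USE_MKL one factor is kept per *edge*,
  //   norm(i,j) = 1 / (sqrt(deg(i)) * sqrt(deg(j)))   (symmetric GCN normalization),
  // so the vector is resized to sizeEdges(); otherwise a per-*vertex* factor
  //   norm(v) = 1 / sqrt(deg(v))
  // is kept, so size() entries suffice. The entries themselves are filled in
  // constructNormFactor() / constructNormFactorSub() below.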
this->normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); +#else + this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); +#endif +} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Norm factor construction\n"); + // using original graph to get ids + Graph* wholeGraph = globalContext->getFullGraph(); + + allocNormFactor(); + // this is for testing purposes + // galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // [&] (unsigned i) { + // this->normFactors[i] = 0; + // } + //); + +#ifdef USE_MKL + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned i) { + float_t c_i = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } + } + }, + galois::loopname("NormCountingEdge")); +#else + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); +#endif + galois::gPrint("[", myID, "] Norm factor construction done \n"); +} + +void DistContext::constructNormFactorSub(int subgraphID) { + // galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution + + allocNormFactorSub(subgraphID); + + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); + + // TODO using partitioned subgraph rather than whoel graph; i.e. dist + // setting wrong +#ifdef USE_MKL + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned i) { + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); + e++) { + const auto j = graphToUse.getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactorsSub[e] = 0.0; + } else { + this->normFactorsSub[e] = 1.0 / (c_i * c_j); + } + } + }, + galois::loopname("NormCountingEdge")); +#else + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned v) { + // auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactorsSub[v] = 0.0; + } else { + this->normFactorsSub[v] = 1.0 / temp; + } + // galois::gPrint(this->normFactorsSub[v], "\n"); + }, + galois::loopname("NormCountingNode")); +#endif + // galois::gPrint("Sub norm factor construction done\n"); +} +//! generate labels for the subgraph, m is subgraph size, mask +//! 
tells which vertices to use +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); +} + +//! generate input features for the subgraph, m is subgraph size, +//! masks tells which vertices to use +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); +} + +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { + return DistContext::syncSubstrate; +} + +//! 
allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +bool DistContext::isOwned(unsigned gid) { + return this->partitionedGraph->isOwned(gid); +} + +bool DistContext::isLocal(unsigned gid) { + return this->partitionedGraph->isLocal(gid); +} + +unsigned DistContext::getGID(unsigned lid) { + return this->partitionedGraph->getGID(lid); +} + +unsigned DistContext::getLID(unsigned gid) { + return this->partitionedGraph->getLID(gid); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu new file mode 100644 index 0000000000..30704b0748 --- /dev/null +++ b/libdeepgalois/src/DistContext.cu @@ -0,0 +1,257 @@ +#include +#include +#include +#include +#include "deepgalois/DistContext.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/configs.h" + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +namespace deepgalois { + +// computing normalization factor for each vertex +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + assert(src < n); + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + auto start = graph.edge_begin(src); + index_t end = graph.edge_end(src); + for (index_t e = start; e != end; e++) { + index_t dst = graph.getEdgeDst(e); + // if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, + // dst, e, start, end); + assert(dst < n); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + +cublasHandle_t DistContext::cublas_handle_ = 0; +cusparseHandle_t DistContext::cusparse_handle_ = 0; +cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; +curandGenerator_t DistContext::curand_generator_ = 0; + +DistContext::DistContext() : DistContext(true) { + d_labels = NULL; + d_feats = NULL; + d_labels_subg = NULL; + d_feats_subg = NULL; + d_normFactors = NULL; + d_normFactorsSub = NULL; + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); + CUSPARSE_CHECK( + cusparseSetMatType(cusparse_matdescr_, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK( + cusparseSetMatIndexBase(cusparse_matdescr_, CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, 
CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +DistContext::~DistContext() { + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (cusparse_handle_) + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + if (cusparse_matdescr_) + CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (d_normFactors) CUDA_CHECK(cudaFree(d_normFactors)); + if (d_labels_subg) CUDA_CHECK(cudaFree(d_labels_subg)); + if (d_feats_subg) CUDA_CHECK(cudaFree(d_feats_subg)); + if (d_normFactorsSub) CUDA_CHECK(cudaFree(d_normFactorsSub)); +} + +size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { + num_classes = reader.read_labels(isSingleClass, h_labels); + return num_classes; +} + +size_t DistContext::read_features(std::string dataset_str) { + feat_len = reader.read_features(h_feats); + return feat_len; +} + +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { + return reader.read_masks(mask_type, n, begin, end, masks); +} + +//! allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + size_t labels_size = m; + if (!usingSingleClass) labels_size = m * num_classes; + h_labels_subg.resize(labels_size); + size_t count = 0; + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (usingSingleClass) h_labels_subg[count] = h_labels[i]; + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + count++; + } + } + if (d_labels_subg) uint8_free_device(d_labels_subg); + uint8_malloc_device(labels_size, d_labels_subg); + uint8_copy_device(labels_size, &h_labels_subg[0], d_labels_subg); +} + +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + //std::cout << "construct subgraph features (d_feats_subg: " << d_feats_subg << ") ... "; + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, &h_feats_subg[count * feat_len]); + count++; + } + } + if (d_feats_subg) float_free_device(d_feats_subg); + float_malloc_device(m * feat_len, d_feats_subg); + float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); + //std::cout << "Done\n"; +} + +void DistContext::constructNormFactorSub(int subgraphID) { + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + auto n = graphToUse.size(); + //std::cout << "Pre-computing subgraph normalization factor (n=" << n << ") ... 
"; + + #ifdef USE_CUSPARSE + auto nnz = graphToUse.sizeEdges(); + float_malloc_device(nnz, d_normFactorsSub); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, graphToUse, d_normFactorsSub); +#else + float_malloc_device(n, d_normFactorsSub); + norm_factor_computing_node<<>>( + n, graphToUse, d_normFactorsSub); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + //std::cout << "Done\n"; +} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + auto n = partitionedGraph->size(); + std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; + if (!is_selfloop_added) { + std::cout << "Set -sl=1 to add selfloop\n"; + exit(0); + } +#ifdef USE_CUSPARSE + auto nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, *partitionedGraph, d_normFactors); +#else + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, n * sizeof(float_t))); + norm_factor_computing_node<<>>( + n, *partitionedGraph, d_normFactors); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + std::cout << "Done\n"; +} + +/* +void DistContext::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) +CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, +CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, +cluster_seedgen())); +} +*/ +size_t DistContext::read_graph(std::string dataset, bool selfloop) { + partitionedGraph = new DGraph(); +#ifdef USE_CSRGRAPH + std::string filename = path + dataset + ".csgr"; + GraphGPU g; + g.read(filename.c_str(), false); + if (selfloop) { + g.add_selfloop(); + is_selfloop_added = selfloop; + } + g.copy_to_gpu(*partitionedGraph); +#else + partitionedGraph->readGraph(dataset); + if (selfloop) { + partitionedGraph->add_selfloop(); + is_selfloop_added = selfloop; + } + partitionedGraph->copy_to_gpu(); +#endif + return partitionedGraph->size(); +} + +void DistContext::copy_data_to_device() { + auto n = partitionedGraph->size(); + std::cout << "Copying labels and features to GPU memory. n = " << n << " ... "; + if (usingSingleClass) { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + } else { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + } + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + // print_device_vector(10, d_feats, "d_feats"); + std::cout << "Done\n"; +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp new file mode 100644 index 0000000000..d07b19f912 --- /dev/null +++ b/libdeepgalois/src/Net.cpp @@ -0,0 +1,177 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. 
+ */ + +#include "galois/Timer.h" +#include "galois/Galois.h" +#include "deepgalois/Net.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void Net::partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel) { + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); + this->distContext->saveDistGraph(dGraph); + this->distNumSamples = this->dGraph->size(); + + // TODO self loop setup would have to be done before this during partitioning + // or on master node only + + this->distContext->initializeSyncSubstrate(); + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // std::cout << "Reading label masks ... "; + this->distTrainMasks = new mask_t[this->distNumSamples]; + this->distValMasks = new mask_t[this->distNumSamples]; + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, + 0); + std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); + + // load the training/val masks + if (dataset_str == "reddit") { + // find local ID from global ID, set if it exists + for (size_t i = this->globalTrainBegin; i < this->globalTrainEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distTrainMasks[this->dGraph->getLID(i)] = 1; + } + } + for (size_t i = this->globalValBegin; i < this->globalValEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distValMasks[this->dGraph->getLID(i)] = 1; + } + } + } else { + globalTrainCount = this->distContext->read_masks( + dataset_str, "train", this->distNumSamples, this->globalTrainBegin, + this->globalTrainEnd, this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + dataset_str, "val", this->distNumSamples, this->globalValBegin, + this->globalValEnd, this->distValMasks, this->dGraph); + } + + // input feature dimension: D + feature_dims[0] = this->distContext->read_features(dataset_str); + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E +} + +void Net::allocateSubgraphsMasks(int num_subgraphs) { + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + // TODO: parallel + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); +} + +void Net::read_test_masks(std::string dataset) { + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + globalTestMasks[i] = 1; + } + } else { + globalTestCount = graphTopologyContext->read_masks( + "test", globalSamples, globalTestBegin, globalTestEnd, globalTestMasks); + } +} + +void Net::readDistributedTestMasks(std::string dataset) { + distTestMasks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + if (dGraph->isLocal(i)) + distTestMasks[dGraph->getLID(i)] = 1; + } + } else { + globalTestCount = distContext->read_masks( + dataset, 
std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, distTestMasks, dGraph); + } +} + +/** + * @param gBegin GLOBAL begin + * @param gEnd GLOBAL end + * @param gMasks: GLOBAL masks + * @param gCount GLOBAL training count + */ +acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth) { + galois::DGAccumulator accuracy_all; + galois::DGAccumulator sampleCount; + accuracy_all.reset(); + sampleCount.reset(); + + galois::do_all( + galois::iterate(gBegin, gEnd), + [&](const auto& gid) { + // only look at owned nodes (i.e. masters); the prediction for these + // should only by handled on the owner + if (this->dGraph->isOwned(gid)) { + sampleCount += 1; + uint32_t localID = this->dGraph->getLID(gid); + if (gMasks == NULL) { + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == localGroundTruth[localID]) { + accuracy_all += 1.0; + } + } else { + if (gMasks[gid] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == localGroundTruth[localID]) { + accuracy_all += 1.0; + } + } + } + } + }, + galois::loopname("getMaskedLoss")); + + gCount = sampleCount.reduce(); + galois::gDebug("Total sample count is ", gCount); + // all hosts should get same accuracy + return accuracy_all.reduce() / (acc_t)gCount; +} + +acc_t Net::masked_multi_class_accuracy(size_t gBegin, size_t gEnd, + size_t gCount, mask_t* gMasks, + float_t* preds, + label_t* localGroundTruth) { + // TODO fix this + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_DIE( + "Multi-class accuracy not yet implemented for distributed setting\n"); + } + + return deepgalois::masked_f1_score(gBegin, gEnd, gCount, gMasks, num_classes, + localGroundTruth, preds); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu new file mode 100644 index 0000000000..ee70e1d578 --- /dev/null +++ b/libdeepgalois/src/Net.cu @@ -0,0 +1,227 @@ +#include "deepgalois/Net.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" +#include + +// the arguments of the maxima +__device__ int argmax_device(const int n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (int i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(int num_classes, int begin, int end, + mask_t* masks, float_t* preds, + label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + assert(count > 0); + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_accuracy kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; +} + +typedef 
float f1count_t; +__global__ void +masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, + float_t* preds, label_t* labels, + f1count_t* true_positive, f1count_t* false_positive, + f1count_t* false_negtive, f1count_t* true_negtive) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { + for (size_t j = 0; j < num_classes; j++) { + int idx = id * num_classes + j; + if (labels[idx] == 1 && preds[idx] > 0.5) { + atomicAdd(&true_positive[j], 1.0); + } else if (labels[idx] == 0 && preds[idx] > 0.5) { + atomicAdd(&false_positive[j], 1.0); + } else if (labels[idx] == 1 && preds[idx] <= 0.5) { + atomicAdd(&false_negtive[j], 1.0); + } else { + atomicAdd(&true_negtive[j], 1.0); + } + } + } + } +} + +acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + float beta = 1.0; + assert(count > 0); + f1count_t* h_tp = new f1count_t[num_classes]; + f1count_t* h_fp = new f1count_t[num_classes]; + f1count_t* h_fn = new f1count_t[num_classes]; + f1count_t* h_tn = new f1count_t[num_classes]; + f1count_t *d_tp, *d_fp, *d_fn, *d_tn; + float_malloc_device(num_classes, d_tp); + float_malloc_device(num_classes, d_fp); + float_malloc_device(num_classes, d_fn); + float_malloc_device(num_classes, d_tn); + init_const_gpu(num_classes, 0.0, d_tp); + init_const_gpu(num_classes, 0.0, d_fp); + init_const_gpu(num_classes, 0.0, d_fn); + init_const_gpu(num_classes, 0.0, d_tn); + masked_f1_score_kernel<<>>( + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); + CudaTest("solving masked_f1_score_kernel kernel failed"); + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; + acc_t precisionMacro = 0.0; + acc_t recallMacro = 0.0; + for (size_t i = 0; i < num_classes; i++) { + acc_t fn = (acc_t)h_fn[i]; // false negtive + acc_t fp = (acc_t)h_fp[i]; // false positive + acc_t tp = (acc_t)h_tp[i]; // true positive + // acc_t tn = (acc_t)h_tn[i]; // true positive + + precisionMacro = precisionMacro + (tp / (tp + fp)); + recallMacro = recallMacro + (tp / (tp + fn)); + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); + } + precisionMacro = precisionMacro / num_classes; + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + ((beta * beta) * precisionMacro + recallMacro); + acc_t recallMicro = rNumerator / rDenominator; + acc_t precisionMicro = pNumerator / pDenominator; + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro + << ", f1_macro: " << f1_macro << ") "; + + float_free_device(d_tp); + float_free_device(d_fp); + float_free_device(d_fn); + float_free_device(d_tn); + delete[] h_tp; + delete[] h_fp; + delete[] h_fn; + delete[] h_tn; + return f1_micro; +} + +namespace deepgalois { + +void Net::allocateSubgraphsMasks(int num_subgraphs) { + 
subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + //CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); +} + +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + + this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); + + // read the graph into CPU memory and copy it to GPU memory + this->distNumSamples = this->distContext->read_graph(dataset_str, is_selfloop); + + // read labels into CPU memory + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // read features into CPU memory + feature_dims[0] = this->distContext->read_features(dataset_str); + + // copy labels and features from CPU memory to GPU memory + distContext->copy_data_to_device(); // copy labels and input features to the device + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E +} + +void Net::read_test_masks(std::string dataset) { + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) + globalTestMasks[i] = 1; + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); + } + //copy_test_masks_to_device(); + copy_masks_device(globalSamples, globalTestMasks, d_test_masks); +} + +//void Net::copy_test_masks_to_device() {} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +} + +//void Net::normalize() {} + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); +} + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp new file mode 100644 index 0000000000..23efe124d2 --- /dev/null +++ b/libdeepgalois/src/RandomWalk.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + + // allocate the object for the new masked graph + Sampler::globalMaskedGraph = new GraphCPU(); + + std::vector degrees(g->size(), 0); + galois::gPrint("graph size: ", g->size(), "\n"); + // get degrees of nodes that will be in new graph 
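  // The masked graph is built with the usual CSR construction pattern:
  //   (1) for every masked vertex src, count only the edges whose
  //       destination is also masked (the do_all below fills `degrees`);
  //   (2) a parallel prefix sum over `degrees` gives the CSR row offsets,
  //       so offsets[src + 1] is the value passed to fixEndEdge();
  //   (3) a second pass writes the surviving edges with constructEdge(),
  //       starting at offsets[src] for each vertex.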
+ // this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all( + galois::iterate(size_t(0), g->size()), + [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + }, + galois::loopname("update_degrees")); + + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[g->size()]; + + // save ids (of original graph) of training nodes to vector + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) + Sampler::trainingNodes.push_back(i); + } + + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); + // same as original graph, except keep only edges involved in masks + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } + } + }, + galois::loopname("gen_subgraph")); + + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + + // TODO masked part graph as well to save time later; right now constructing + // from full part graph +} + +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) + m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDstHost( + Sampler::globalMaskedGraph->edge_begin_host(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp new file mode 100644 index 0000000000..055b5c0a85 --- /dev/null +++ b/libdeepgalois/src/Sampler.cpp @@ -0,0 +1,360 @@ +#include +#include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" +#define PARALLEL_GEN + +namespace deepgalois { + +//! 
debug function: prints out sets of vertices +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter++; + if (counter > 16 && counter < n - 16) + continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +/* +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 
0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} +*/ + +// API function for user-defined selection strategy +// Select n vertices from vertices and put them in vertex_set. +// nv: number of vertices in the original graph; +// n: number of vertices in the subgraph; +// m: number of vertices in the frontier. +// our implementation of GraphSAINT sampling +void Sampler::selectVertices(index_t nv, index_t n, Graph* g, + VertexList vertices, VertexSet& vertex_set) { + // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, + // graph size: ", g->size(), "\n"); + assert(nv == vertices.size()); + // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items((int)m, 0, (int)nv); + VertexList frontier(m); + for (index_t i = 0; i < m; i++) + frontier[i] = vertices[frontier_indices[i]]; + vertex_set.insert(frontier.begin(), frontier.end()); + // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + int* degrees = new int[m]; + // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + for (index_t i = 0; i < m; i++) { + degrees[i] = (int)getDegree(g, frontier[i]); + } //, galois::loopname("compute_degrees")); + for (index_t i = 0; i < n - m; i++) { + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; + auto degree = degrees[pos]; + int j = 0; + for (; j < degree; j++) { + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + if (vertex_set.find(dst) == vertex_set.end()) { + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.insert(dst); + break; + } + } + if (j == degree) + galois::gPrint("Not found from ", degree, " neighbors\n"); + } + /* + assert(n == vertex_set.size()); // size of vertex_set could be slightly + smaller than n galois::gPrint("Done selection, vertex_set size: ", + vertex_set.size(), ", set: "); print_vertex_set(vertex_set); + */ +} + +// Given a subset of vertices and a graph g, generate a subgraph sg from the +// graph g +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { + // auto n = origGraph.size(); // old graph size + auto nv = 
keptVertices.size(); // new graph (subgraph) size + VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); + std::vector degrees(nv, 0); // degrees of vertices in the subgraph + for (auto v : keptVertices) { + degrees[new_ids[v]] = getDegree(&origGraph, v); + } + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + auto ne = offsets[nv]; + // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, + // "\n"); + reindexGraph.allocateFrom(nv, ne); + reindexGraph.constructNodes(); + VertexList old_ids(keptVertices.begin(), + keptVertices.end()); // vertex ID mapping +#ifdef PARALLEL_GEN + galois::do_all( + galois::iterate(size_t(0), size_t(nv)), + [&](const auto i) { +#else + for (size_t i = 0; i < nv; i++) { +#endif + reindexGraph.fixEndEdge(i, offsets[i + 1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e = origGraph.edge_begin(old_id); + e != origGraph.edge_end(old_id); e++) { + auto dst = new_ids[origGraph.getEdgeDst(e)]; + assert(dst < nv); + reindexGraph.constructEdge(offsets[i] + j, dst, 0); + j++; + } + } +#ifdef PARALLEL_GEN + , + galois::loopname("construct_graph")); +#endif +} + +VertexSet Sampler::convertToLID(VertexSet& gidSet) { + VertexSet existingLIDs; + // find local selected vertices, convert to lid + for (auto i : gidSet) { + if (partGraph->isLocal(i)) { + existingLIDs.insert(partGraph->getLID(i)); + } + } + return existingLIDs; +} + +template +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + // template <> + // void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, + // std::vector& degrees) { + assert(degrees.size() == n); + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + // for (size_t src = 0; src < n; src++) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + }, + galois::loopname("update_degrees")); +} + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, + SubgraphTy* sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, + // "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub->allocateFrom(n, ne); + sub->constructNodes(); + + galois::do_all( + galois::iterate(size_t(0), size_t(n)), + [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } + } + }, + galois::loopname("gen_subgraph")); +} + +void Sampler::generateSubgraph(VertexSet& sampledSet, mask_t* masks, + Graph* sg) { + // n = 9000 by default + // do the sampling of vertices from training set + using masked graph + + // sampledSet is a list of *global* ids in the graph + // create new vertex set with LIDs for partitioned graph + VertexSet sampledLIDs = this->convertToLID(sampledSet); + + // VertexSet sampledLIDs; + // galois::gPrint("part graph num edges is ", 
partGraph->sizeEdges(), "\n"); + // galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), + // "\n"); for (auto i : this->trainingNodes) { + // sampledLIDs.insert(i); + //} + + // create the masks + createMasks(Sampler::partGraph->size(), sampledLIDs, masks); + + // this graph will contain sampled vertices and induced subgraph for it + Graph maskedSG; + // TODO use partMaskedGraph once constructed later + // remove edges whose destination is not masked + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + &maskedSG); + this->reindexSubgraph(sampledLIDs, maskedSG, *sg); + + // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu new file mode 100644 index 0000000000..1cdfc49e32 --- /dev/null +++ b/libdeepgalois/src/Sampler.cu @@ -0,0 +1,168 @@ +#include +#include +#include "deepgalois/cutils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +__global__ void clear_masks(index_t n, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[i] = 0; } +} + +// set the masks of vertices in a given vertex set +// n is the size of the vertex set +__global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[vertices[i]] = 1; } +} + +// compute the degrees of a masked graph +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, + index_t* degrees) { + CUDA_KERNEL_LOOP(src, n) { + //if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + degrees[src] = 0; + if (masks[src] == 1) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + //if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); + } +} + +// Given a graph, remove any edge which has end-point masked, and generate the +// subgraph n is the size of the original graph and the subgraph offset was +// computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t* masks, + const index_t* offsets, GraphGPU g, + GraphGPU subg) { + CUDA_KERNEL_LOOP(src, n) { + subg.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) + subg.constructEdge(idx++, dst); + } + } + } +} + +// compute the degrees of the subgraph induced by the vertex set +// n is the size of the vertex set +// new_ids array maps vertex ID in the original graph to the vertex ID in the +// subgraph +__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, + GraphGPU g, index_t* degrees) { + CUDA_KERNEL_LOOP(i, n) { + auto v = vertices[i]; + degrees[new_ids[v]] = g.getOutDegree(v); + } +} + +// Given a masked graph, remove the masked vertices, reindex the rest vertices, +// and generate the subgraph offset was computed by using prefix-sum of the new +// degrees n is the size of the old_ids and the sbugraph +__global__ void generate_graph_kernel(index_t n, const index_t* offsets, + const index_t* old_ids, + const index_t* new_ids, GraphGPU g, + GraphGPU subg) { + CUDA_KERNEL_LOOP(i, n) { + subg.fixEndEdge(i, offsets[i + 1]); + index_t j = 0; + auto src = old_ids[i]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < n); + 
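      // `i` is the vertex id in the re-indexed subgraph, src = old_ids[i] is
      // its id in the masked graph, and each destination is remapped through
      // new_ids[] so every id written into `subg` lies in [0, n). `offsets`
      // holds the exclusive prefix sum of the re-indexed degrees computed in
      // generateSubgraph() below, so offsets[i] + j is the CSR slot of the
      // j-th edge kept for vertex i.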
subg.constructEdge(offsets[i] + j, dst); + j++; + } + } +} + +/* +void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { + index_t vid = 0; + for (index_t i = 0; i < n; i++) { + auto v = vertices[i]; + new_indices[v] = vid++; + } +} +*/ + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { + //std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; + index_t *degrees, *offsets; + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); + get_masked_degrees<<>>(n, masks, *g, degrees); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); + thrust::exclusive_scan(thrust::device, degrees, degrees+n+1, offsets); + CUDA_CHECK(cudaFree(degrees)); + index_t ne; + CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); + //std::cout << "maskedSG num_edges " << ne << "\n"; + subg->allocateFrom(n, ne); // TODO: avoid reallocation + generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); + CUDA_CHECK(cudaFree(offsets)); +} + +// n: size of the original graph +// nv: size of the subgraph; i.e. size of vertex_set +// masks, graph g and subgraph sub are on the device (GPU) +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { + index_t n = partGraph->size(); + auto nv = vertex_set.size(); + //std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; + // convert the vertex_set to a vertex_list and copy it to the device + VertexList vertex_list(vertex_set.begin(), vertex_set.end()); + index_t* d_vertex_list; + cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); + + createMasks(n, vertex_set, masks); + mask_t* d_masks; + cudaMalloc((void**)&d_masks, n * sizeof(mask_t)); + CUDA_CHECK(cudaMemcpy(d_masks, masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + //clear_masks<<>>(n, d_masks); // set all 0 + //CudaTest("solving clear_masks kernel failed"); + // createMasks: set masks for vertices in the vertex_set + //set_masks<<>>(n, d_vertex_list, d_masks); + //CudaTest("solving set_masks kernel failed"); + GraphGPU masked_sg; // size is the same as original graph, but masked dst removed + getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked + //std::cout << "maskedGraph generated\n"; + + // re-index the subgraph + index_t* d_new_ids; + cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); + // Given an old vertex ID ∈ [0, n), returns a new vertex ID ∈ [0, nv) + auto new_ids = reindexVertices(n, vertex_set); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); + + // generate the offsets for the re-indexed subgraph + index_t *degrees, *offsets; + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv)); + get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); + CudaTest("solving get_new_degrees kernel failed"); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); + thrust::exclusive_scan(thrust::device, degrees, degrees+nv+1, offsets); + CUDA_CHECK(cudaFree(degrees)); + index_t ne; + CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); + //std::cout << "subgraph num_edges " << ne << "\n"; + + // allocate memory for the subgraph + sub->allocateFrom(nv, ne); // avoid reallocation + // generate the subgraph + generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, 
masked_sg, *sub); + CudaTest("solving generate_graph kernel failed"); + CUDA_CHECK(cudaFree(offsets)); + //std::cout << "Subgraph generated\n"; +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp new file mode 100644 index 0000000000..4275232baa --- /dev/null +++ b/libdeepgalois/src/Train.cpp @@ -0,0 +1,554 @@ +#include "galois/Galois.h" +#include "deepgalois/Net.h" + +namespace deepgalois { + +Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, + unsigned subg_sz, int val_itv) + : // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), + // globalValBegin(0), globalValEnd(0), globalValCount(0), + // globalTestBegin(0), globalTestEnd(0), globalTestCount(0), + // globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) + // {} + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + this->header = "[" + std::to_string(myID) + "] "; + assert(n_conv > 0); + this->num_layers = num_conv_layers + 1; + + // additional layers to add + if (has_l2norm) + this->num_layers++; + if (has_dense) + this->num_layers++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); + print_configs(); + + // initialze global graph context + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); + // read *entire* graph, get num nodes + globalSamples = graphTopologyContext->read_graph(selfloop); + + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); + + // reddit is hard coded + if (dataset_str == "reddit") { + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + + // TODO do all can be used below + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) + globalValMasks[i] = 1; + } else { + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); + } + // make sure sampel size isn't greater than what we have to train with + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; + + // features are 
read in distcontext, not this context (this context only + // used for sampling) + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); +} + +void Net::train(optimizer* opt, bool need_validate) { + galois::StatTimer train_timer("Timer_0"); + train_timer.start(); + std::string separator = "\n"; + double total_train_time = 0.0; + int num_subg_remain = 0; +#ifndef GALOIS_ENABLE_GPU + unsigned hostID = galois::runtime::getSystemNetworkInterface().ID; +#endif + + if (subgraph_sample_size) { + galois::StatTimer construct_time("SubgraphAllocateTime"); + construct_time.start(); + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); + construct_time.stop(); + } + + galois::gPrint(header, "Start training...\n"); + + Timer t_epoch; + + // run epochs + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { + t_epoch.Start(); + + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// + if (subgraph_sample_size) { + galois::StatTimer sample_time("SubgraphSampleTime"); + sample_time.start(); + if (num_subg_remain == 0) { + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; + galois::StatTimer t_subgen("SubgraphGenerateTime"); + t_subgen.start(); + + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); + } + num_subg_remain = num_subgraphs; + t_subgen.stop(); + } + // count their degrees + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = distContext->getSubgraphPointer(i); + sg_ptr->degree_counting(); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); + } + + // choose a subgraph to use + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + // std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); + } + + // update labels for subgraph + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); + + // update features for subgraph + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + 
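+      // At this point the sampled subgraph is fully wired in for this epoch:
+      // the conv layers point at its topology and normalization constants,
+      // the output layer at its labels, and the input layer at its features,
+      // so the following forward/backward passes train on the subgraph only.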
// Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} + sample_time.stop(); + } // end subgraph sample loop + //////////////////////////////////////////////////////////////////////////////// + + // training steps +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + if (hostID == 0) { + galois::gInfo("Epoch ", std::setw(3), curEpoch); + } +#endif + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + + // galois::gPrint(header, "Calling into eval for forward propagation\n"); + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + // evaluate("train", train_loss, train_acc); + + // galois::gPrint(header, "Calling into backward propagation\n"); + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight + // gradients + Net::bprop(); + + // galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch + Net::update_weights(opt); // update parameters + + t_epoch.Stop(); + + // validation / testing + set_netphases(net_phase::test); + +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + if (hostID == 0) { + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, separator); + } +#endif + + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + + // report current total time + accuracy as a stat +#ifndef GALOIS_ENABLE_GPU + if (hostID == 0) { + galois::runtime::reportParam( + std::string("GNN"), + "Epoch" + std::to_string(curEpoch) + "TestAccuracyAndTime", + std::to_string(train_acc) + ";" + std::to_string(total_train_time)); + } +#endif + + if (need_validate && (curEpoch % val_interval == 0)) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + if (hostID == 0) { + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, separator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); + } +#endif + } else { +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; +#else + if (hostID == 0) { + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); + } +#endif + } + } // epoch loop + + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; +#ifdef GALOIS_ENABLE_GPU + 
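+  // total_train_time accumulates epoch_time in milliseconds, so the
+  // throughput above is 1000 * num_epochs / total_train_time epochs per second.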
std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif + train_timer.stop(); +} + +// evaluate, i.e. inference or predict +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO get rid of this timer + Timer t_eval; + t_eval.Start(); + + galois::StatTimer eval_timer("EvaluateTime"); + eval_timer.start(); + + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; + + // TODO global here good for dist case? + if (type == "train") { + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; + if (subgraph_sample_size) { + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; + } + } else if (type == "val") { + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; + } else { + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; + } + + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(distNumSamples); + for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef GALOIS_ENABLE_GPU + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + } + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data + } +#ifdef GALOIS_ENABLE_GPU + if (type == "train") { + gMasks = d_train_masks; + } else if (type == "val") { + gMasks = d_val_masks; + } else { + gMasks = d_test_masks; + } +#endif + + // galois::gPrint(header, "Doing actual forward propagation\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + // galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable + label_t* localLabels; + if (type == "train" && subgraph_sample_size) { + localLabels = distContext->get_labels_subg_ptr(); + } else { + // note this grabs local labels + localLabels = distContext->get_labels_ptr(); + } + + if (is_single_class) { + acc = + masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, localLabels); + } else { + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); + } + + eval_timer.stop(); + + // TODO replace with stat timer + t_eval.Stop(); + return t_eval.Millisecs(); +} + +void Net::construct_layers() { + // append conv layers + // galois::gPrint(header, "Constructing layers...\n"); + for (size_t i = 0; i < num_conv_layers - 1; i++) { + append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false + + if (has_l2norm) { + append_l2norm_layer(num_conv_layers); // l2_norm layer + } + if (has_dense) { + append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + 
for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + for (size_t i = 0; i < num_layers; i++) { + layers[i]->malloc_and_init(); + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure + // context->norm_factor_computing(false); + distContext->constructNormFactor(graphTopologyContext); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + set_contexts(); +} + +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); +} + +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); +} +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); +#ifdef GALOIS_ENABLE_GPU + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif +} + +//! forward propagation: [begin, end) is the range of samples used. +//! 
calls "forward" on each layer and returns the loss of the final layer +acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + galois::StatTimer fprop_timer("ForwardPropTime"); + fprop_timer.start(); + // set mask for the last layer; globals + // TODO this should be distirbuted sample gBegin->end not global; fix later + // seems to be unused in code right now anyways + // galois::gPrint(header, "fprop: set sample mask\n"); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); + + for (size_t i = 0; i < num_layers; i++) { + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + layers[i]->forward(); + } + + // galois::gPrint(header, "fprop: getting loss\n"); + // prediction error + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + fprop_timer.stop(); + return loss; +} + +// back propagation +void Net::bprop() { + galois::StatTimer bprop_timer("BackPropTime"); + bprop_timer.start(); + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } + bprop_timer.stop(); +} + +// update trainable weights after back-propagation +void Net::update_weights(optimizer* opt) { + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + +//! Save the context object to all layers of the network +void Net::set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(distContext); +} + +//! set netphases for all layers in this network +void Net::set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); +} + +//! print all layers +void Net::print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); +} + +// print the configurations +void Net::print_configs() { + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp new file mode 100644 index 0000000000..ce9d709dbf --- /dev/null +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -0,0 +1,54 @@ +#include "deepgalois/layers/aggregator.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +// TODO template arg +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, float_t* norm_factor) { + // std::cout << "[update_all] graph size: " << n << "\n"; + size_t n = g.size(); + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + auto src_idx = src * len; + // zero out the output data + math::clear_cpu(len, &out[src_idx]); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + // float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); + // scale the neighbor's data using the normalization factor + math::scale(len, b, &in[dst_idx], 
&neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } else { + // add embeddings from neighbors together; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); + } + } + }, + galois::steal(), galois::loopname("update_all")); +} + +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool, float_t* norm_factor) { + galois::StatTimer Tcsrmm("CSRMM-MKL"); + Tcsrmm.start(); + unsigned n = g.size(); + math::clear_cpu(n * len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, + out); + Tcsrmm.stop(); +} diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu new file mode 100644 index 0000000000..b29e980da3 --- /dev/null +++ b/libdeepgalois/src/layers/aggregator.cu @@ -0,0 +1,102 @@ +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include "deepgalois/cutils.h" +#include "deepgalois/layers/aggregator.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// TODO: use warp +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, + const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) + y[i] = alpha * a[i] + b[i]; +} + +__global__ void update_all_naive(size_t n, size_t len, GraphGPU g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + if (norm) + b = a * norm_factor[dst]; + scale_add(len, b, in + dst * len, out + src * len, + out + src * len); // out[src] += in[dst] + } + } +} + +__global__ void update_all_warp(size_t n, size_t len, GraphGPU g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE / WARP_SIZE][2]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int src = warp_id; src < n; src += num_warps) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + if (thread_lane < 2) + ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); + __syncthreads(); + const index_type row_begin = ptrs[warp_lane][0]; + const index_type row_end = ptrs[warp_lane][1]; + index_type base_src = src * len; + for (index_type offset = row_begin; offset < row_end; offset++) { + index_type dst = g.getEdgeDst(offset); + if (norm) + b = a * norm_factor[dst]; + index_type base_dst = dst * len; + for (int i = 0; i < len; i += WARP_SIZE) + if (thread_lane + i < len) + out[base_src + thread_lane + i] += in[base_dst + thread_lane + i] * b; + } + } +} + +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.size(); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + // update_all_naive<<>>(n, len, g, in, + // out, norm, norm_factor); + update_all_warp<<<(n - 1) / WARPS_PER_BLOCK + 
1, BLOCK_SIZE>>>( + n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); +} + +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + // g.print_test(); + unsigned n = g.size(); + auto nnz = g.sizeEdges(); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + // std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " + // nnz " << nnz << "\n"; print_device_vector(10, norm_factor, "norm_factor"); + float* temp; + const int* row_start = (const int*)g.row_start_ptr(); + const int* edge_dst = (const int*)g.edge_dst_ptr(); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); + // print_device_int_vector(10, row_start, "row_start"); + // print_device_int_vector(10, edge_dst, "edge_dst"); + float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + float_free_device(temp); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h new file mode 100644 index 0000000000..d57f485a8c --- /dev/null +++ b/libdeepgalois/src/layers/gat_fw.h @@ -0,0 +1,158 @@ +// #define USE_GAT +#ifdef USE_GAT +// `Graph Attention Network ` +// NOTE: GAT paper uses "first concatenation then linear projection" +// to compute attention scores, while ours is "first projection then +// addition", the two approaches are mathematically equivalent: +// We decompose the weight vector a mentioned in the paper into +// [a_l || a_r], then a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j +// Our implementation is much efficient because we do not need to +// save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, +// addition could be optimized with DGL's built-in function u_add_v, +// which further speeds up computation and saves memory footprint. 
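+// A minimal sketch of the decomposed per-edge score, using math::dot and
+// math::leaky_relu only as they appear in aggregate() below; the helper name
+// gat_edge_score is purely illustrative and not part of this layer:
+//
+//   static float_t gat_edge_score(size_t len, const float_t* a_l,
+//                                 const float_t* a_r, const float_t* Wh_i,
+//                                 const float_t* Wh_j, float_t eps) {
+//     // a^T [Wh_i || Wh_j] == a_l . Wh_i + a_r . Wh_j
+//     float_t s = math::dot(len, a_l, Wh_i) + math::dot(len, a_r, Wh_j);
+//     float_t score;
+//     math::leaky_relu(eps, s, score);
+//     return score;
+//   }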
+ +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + size_t n = g.size(); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // concatenation, dot product, LeakyReLU + // int i = 0; + // vec_t scores(deg_src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + // alpha: learnable weight vector (shared by all vertices) + float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + // vec_t concat_vec(2*len); + // math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); + temp_scores[e] = src_score + dst_score; + math::leaky_relu(epsilon, temp_scores[e], scores[e]); + } + + // softmax to normalize the attention scores on each vertex’s incoming edges + // vec_t normalized_scores(deg_src, 0); + // math::softmax(deg_src, &scores[0], &normalized_scores[0]); + math::softmax(deg_src, &scores[begin], &norm_scores[begin]); + + // aggregation: scaled by the attention scores + math::clear_cpu(len, &out[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = norm_scores[e]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); +} + +void graph_conv_layer::d_compute_scores(size_t len, Graph& g, + const float_t* in_data, + const float_t* out_data, + const float_t* in_grad) { + size_t n = g.size(); + + // compute gradients for the learnable vector `alpha` + // vec_t temp_grad(n*n); + // math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // in_grad, 0.0, temp_grad); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + auto deg_src = g.get_degree(src); + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + &scores_grad[begin], &norm_scores_grad[begin]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + // use norm_scores_grad as temp_scores_grad since its data is useless + // already + math::d_leaky_relu(epsilon, &scores_grad[e], &temp_scores[e], + &norm_scores_grad[e]); + math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); + math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); + } + }); +} + +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in_grad, + float_t* out_grad) { + size_t n = g.size(); + + // aggregation: the derivative is transposed; + // the graph is undirected (structurally symmetric), + // but values are not the same for the symmetric positions + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto src_begin = g.edge_begin(src); + for (auto e = src_begin; e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto dst_begin = g.edge_begin(dst); + auto score = norm_scores[dst_begin + e - src_begin]; // transposed + vec_t neighbor(len); + math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); + } + }); +} + +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* 
out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // dropout + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + + // linear transformation + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + + // aggregation + aggregate(z, *graph_cpu, out_temp, out_data); + + // ReLU + if (act_) + math::relu_cpu(x * z, out_data, out_data); +} + +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + + // compute gradients for alpha (alpha is a learnable vector) + d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); + // compute gradients for feature vectors + d_aggregate(z, *graph_cpu, out_grad, out_temp); + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); +} + +#endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp new file mode 100644 index 0000000000..f13b26be25 --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -0,0 +1,295 @@ +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/utils.h" + +static galois::DynamicBitSet bitset_conv; + +#include "deepgalois/layers/GraphConvSyncStructures.h" +#include "deepgalois/layers/GradientSyncStructs.h" + +namespace deepgalois { +#include "gat_fw.h" + +//! Set this to let sync struct know where to get data from +float_t* _dataToSync = nullptr; +//! Set this to let sync struct know the size of the vector to use during +//! 
sync +long unsigned _syncVectorSize = 0; + +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix, unsigned seed) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng(seed); + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } +} + +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } +} + +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + galois::gInfo("conv bitset size is going to be ", x); + bitset_conv.resize(x); + + // setup gluon + layer::gradientGraph = + new deepgalois::GluonGradients(layer::weight_grad, y * z); + layer::syncSub = + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); + galois::gInfo("gradient bitset size is going to be ", y * z, " ", y, " ", z); + + // make sure seed consistent across all hosts for weight matrix + rand_init_matrix(y, z, W, 1); + // rand_init_matrix(y, z, Q, 1); // for GraphSAGE + + zero_init_matrix(y, z, layer::weight_grad); + +#ifdef USE_GAT + // alpha is only used for GAT + rand_init_matrix(z, 1, alpha_l, 1); + rand_init_matrix(z, 1, alpha_r, 1); + alpha_lgrad.resize(2 * z); + alpha_rgrad.resize(2 * z); + std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); + std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); + auto ne = graph_cpu->sizeEdges(); // number of edges + scores.resize(ne); // a score for each edge + temp_scores.resize(ne); + scores_grad.resize(ne); + norm_scores.resize(ne); + norm_scores_grad.resize(ne); + epsilon = 0.2; // LeakyReLU angle of negative slope +#endif + dropout_ = true; + act_ = false; + + if (dropout_) + dropout_mask = new mask_t[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t[x * z]; + trans_data = new float_t[y * x]; // y*x + if (y <= z) + in_temp1 = new float_t[x * y]; +} + +namespace { +void set_conv_bitset() { + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_conv.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_conv.set(node_id); + } + }, + galois::loopname("BitsetGraphConv"), galois::no_stats()); +} + +} // end anonymous namespace + +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors +} + +#ifndef USE_GAT +// aggregate based on graph topology +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); + // normalization constant 
based on graph structure +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + update_all(len, g, in, out, norm_, norm_consts); +#endif + aggregate_timer.stop(); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif + aggregate_timer.stop(); +} + +// 𝒉[𝑙] = σ(𝑊 * Σ(𝒉[𝑙-1])) +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + galois::gPrint("forward ", x, " ", y, " ", z, "\n"); + + galois::StatTimer drop_timer("GraphConvForwardDropout"); + drop_timer.start(); + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + drop_timer.stop(); + + galois::StatTimer compute_timer("GraphConvForwardCompute"); + compute_timer.start(); + if (y > z) { + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + aggregate(z, *graph_cpu, out_temp, out_data); + } else { + aggregate(y, *graph_cpu, in_temp, in_temp1); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, + &layer::W[0], 0.0, out_data); + } + compute_timer.stop(); + + // TODO sync of out_data required here + // TODO how to do this for the sampled case? + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + set_conv_bitset(); + + galois::gPrint("forward ", x, " ", y, " ", z, " sync calling\n"); + layer::context->getSyncSubstrate() + ->sync("GraphConvForward"); + + // run relu activation on output if specified + galois::StatTimer relu_timer("GraphConvForwardRelu"); + relu_timer.start(); + if (act_) + math::relu_cpu(x * z, out_data, out_data); + relu_timer.stop(); + + conv_timer.stop(); +} + +// 𝜕𝐸 / 𝜕𝑦[𝑙−1] = 𝜕𝐸 / 𝜕𝑦[𝑙] ∗ 𝑊 ^𝑇 +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + galois::StatTimer conv_timer("GraphConvBackward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + // note; assumption here is that out_grad contains 1s or 0s via relu? 
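+  // Backward mirrors the forward ordering: when y > z the forward pass did
+  // (multiply by W, then aggregate), so here the transposed derivative is
+  // aggregated first and W^T applied afterwards; otherwise W^T is applied
+  // first and the result aggregated, reusing in_temp as the intermediate.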
+ galois::StatTimer relu_timer("GraphConvBackwardRelu"); + relu_timer.start(); + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + relu_timer.stop(); + // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + + galois::StatTimer compute_timer("GraphConvBackwardCompute"); + compute_timer.start(); + if (y > z) { + d_aggregate(z, *graph_cpu, out_grad, out_temp); + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights + // this calculates gradients for the node predictions + if (level_ != 0) { // no need to calculate in_grad for the first layer + // derivative of matmul needs transposed matrix + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + } + // calculate weight gradients using input data; multiplied by gradients from + // last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } else { + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], + 0.0, in_temp); + d_aggregate(y, *graph_cpu, in_temp, in_grad); + } + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, + 0.0, &layer::weight_grad[0]); + } + compute_timer.stop(); + + // sync agg + // galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + if (level_ != 0) { + deepgalois::_syncVectorSize = y; + deepgalois::_dataToSync = in_grad; + set_conv_bitset(); + layer::context->getSyncSubstrate() + ->sync( + //->sync( + "GraphConvBackward"); + } + galois::StatTimer drop_timer("GraphConvBackwardDropout"); + drop_timer.start(); + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); + drop_timer.stop(); + + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = &layer::weight_grad[0]; + unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; + layer::syncSub->sync("Gradients"); + galois::do_all( + galois::iterate((size_t)0, (size_t)z), + [&](size_t i) { + // galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + // galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process")); + + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); + conv_timer.stop(); +} +#endif + +acc_t graph_conv_layer::get_weight_decay_loss() { + return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu new file mode 100644 index 0000000000..f8b59d3c0e --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -0,0 +1,117 @@ +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + if (dropout_) + CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); + float_malloc_device(x * y, in_temp); + init_const_gpu(x * y, 0.0, in_temp); + if (y <= z) { + float_malloc_device(x * y, in_temp1); + init_const_gpu(x * y, 0.0, in_temp1); + } + float_malloc_device(x * z, out_temp); + init_const_gpu(x * z, 0.0, out_temp); + float_malloc_device(y * z, d_W); + auto init_range = sqrt(6.0 / (y + z)); + // 
Glorot & Bengio (AISTATS 2010) + rng_uniform_gpu(y * z, -init_range, init_range, d_W); + float_malloc_device(y * z, layer::d_weight_grad); + init_const_gpu(y * z, 0.0, layer::d_weight_grad); +} + +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_consts); +#endif +} + +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_consts); +#endif +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out) {} + +// GPU forward: compute output features +// NOTE: in_data will be used in back-prop, so it can not be modified +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // currently only support feature length <= 128 + if (z > MAX_NUM_CLASSES) { + std::cout << "Currently support maximum hidden feature length of " + << MAX_NUM_CLASSES << "\n"; + exit(0); + } + init_const_gpu(x * z, 0.0, out_temp); + if (dropout_ && phase_ == net_phase::train) + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + else + copy_gpu(x * y, in_data, in_temp); + if (y > z) { + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, + out_temp); + graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); + } else { + graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, + out_data); + } + if (act_) + relu_gpu(x * z, out_data, out_data); +} + +// GPU backward: compute input gradients (in_grad) and weight gradients +// (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_grad); + if (y > z) { + graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); + if (level_ != 0) + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + layer::d_weight_grad); + } else { + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, + in_temp); + graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, + layer::d_weight_grad); + } + if (level_ != 0 && dropout_) + d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); +} + +acc_t graph_conv_layer::get_weight_decay_loss() { + return l2_norm_gpu(input_dims[1] * output_dims[1], d_W); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp new file mode 100644 index 0000000000..8de2406ede --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -0,0 +1,53 @@ +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +void 
l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, + galois::loopname("l2_norm")); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, + galois::loopname("d_l2_norm")); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu new file mode 100644 index 0000000000..ed86cf147d --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -0,0 +1,21 @@ +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + l2_norm_gpu(x, y, in_data, out_data); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp new file mode 100644 index 0000000000..a230de1090 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -0,0 +1,28 @@ +#include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); +} + +// 𝑦[𝑙] = 𝑦[𝑙−1] > 0 ? 
𝑦[𝑙−1]) : 𝑦[𝑙−1] * ε +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + math::leaky_relu_cpu(n, epsilon_, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) +void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu new file mode 100644 index 0000000000..a6271086e9 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -0,0 +1,20 @@ +#include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = 𝑦[𝑙−1] > 0 ? 𝑦[𝑙−1]) : 𝑦[𝑙−1] * ε +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + leaky_relu_gpu(n, epsilon_, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp new file mode 100644 index 0000000000..0576bea642 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -0,0 +1,21 @@ +#include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = max(0, 𝑦[𝑙−1]) +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t n = input_dims[0] * input_dims[1]; + math::relu_cpu(n, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 +void relu_layer::back_propagation(const float_t*, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t n = input_dims[0] * input_dims[1]; + math::d_relu_cpu(n, out_grad, out_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu new file mode 100644 index 0000000000..d457c994ce --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -0,0 +1,22 @@ +#include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = max(0, 𝑦[𝑙−1]) +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp new file mode 100644 index 0000000000..8d72ed4b07 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -0,0 +1,122 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, 
out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { delete[] loss; } + +void sigmoid_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + +inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { + // return context->get_label(i, j); + return labels[i * input_dims[1] + j]; +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t featLen = input_dims[1]; + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + // check if local to this host + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + size_t idx = featLen * lid; + + // output is normalized input for this layer + math::sigmoid(featLen, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + + // one hot encoded vector for the labels + // TODO this is a bottleneck; big lock on memory allocator + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + ground_truth[j] = (float_t)get_label(lid, j); + // loss calculation + this->loss[lid] = + math::cross_entropy(featLen, ground_truth, &out_data[idx]); + + // TODO this is a bottleneck, lock on memory possibly + delete[] ground_truth; + } + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-fw")); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { + size_t featLen = layer::input_dims[1]; + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + + size_t idx = featLen * lid; + // TODO this is bad + float_t* norm_grad = new float_t[featLen]; + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + ground_truth[j] = (float_t)get_label(lid, j); + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, ground_truth, &out_data[idx], + norm_grad); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(featLen, &in_data[idx], &out_data[idx], + &in_grad[idx], norm_grad); + // TODO this is bad + delete[] norm_grad; + delete[] ground_truth; + } + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-bw")); +} + +acc_t sigmoid_loss_layer::get_prediction_loss() { + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu new file mode 100644 index 0000000000..0f5ff9cb69 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -0,0 +1,40 @@ +#include 
"deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } + +void sigmoid_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); +} + +acc_t sigmoid_loss_layer::get_prediction_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp new file mode 100644 index 0000000000..17e7023176 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -0,0 +1,131 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +softmax_loss_layer::~softmax_loss_layer() { delete[] loss; } + +void softmax_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + +inline label_t softmax_loss_layer::get_label(size_t i) { + return labels[i]; + // return context->get_label(i); +} + +// TODO: need kernel fusion optimization +// 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘] +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // size_t numSamples = input_dims; + size_t featLen = input_dims[1]; + // zero out the output vector + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + out_data[i * featLen + j] = 0.0; + } + } + + galois::do_all( + galois::iterate(begin_, end_), + [&](const unsigned gid) { + // if no mask used it means all are fair game + if (!use_mask || masks_[gid] == 1) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + // output is normalized input for this layer + math::softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + // labels are local + groundTruth[get_label(lid)] = 1.0; // one-hot + // loss calculation + loss[lid] = math::cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid]); + } + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-fw")); + + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { + // note: 
out_grad is ignored because it shouldn't exist (this is output layer) + size_t featLen = layer::input_dims[1]; + + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + in_grad[i * featLen + j] = 0.0; + } + } + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + vec_t norm_grad(featLen); + std::vector groundTruth(featLen, 0.0); + groundTruth[get_label(lid)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid], &in_grad[featLen * lid], + &norm_grad[0]); + } + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-bw")); + + // no weight sync required: this is all local graph information +} + +acc_t softmax_loss_layer::get_prediction_loss() { + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu new file mode 100644 index 0000000000..20b7e659d8 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -0,0 +1,40 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } + +void softmax_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); +} + +acc_t softmax_loss_layer::get_prediction_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp new file mode 100644 index 0000000000..31cd353e51 --- /dev/null +++ b/libdeepgalois/src/lgraph.cpp @@ -0,0 +1,41 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/utils.h" +#include "deepgalois/reader.h" +#include 
"galois/Galois.h" +#include + +namespace deepgalois { + +bool LearningGraph::isLocal(index_t) { return true; } + +index_t LearningGraph::getLID(index_t) { return 0; } + +bool LearningGraph::is_vertex_cut() { return true; } + +std::vector>& LearningGraph::getMirrorNodes() { + return mirrorNodes; +} + +uint64_t LearningGraph::numMasters() { return 0; } + +uint64_t LearningGraph::globalSize() { return 0; } + +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + if (selfloop) + std::cout << "selfloop not yet implemented\n"; + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + +void LearningGraph::degree_counting() { + // if (degrees_ != NULL) return; + // degrees_ = new index_t[num_vertices_]; + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); +} + +void LearningGraph::dealloc() {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu new file mode 100644 index 0000000000..9e1f2ab29e --- /dev/null +++ b/libdeepgalois/src/lgraph.cu @@ -0,0 +1,77 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/cutils.h" +#include "deepgalois/reader.h" +#include + +namespace deepgalois { + +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + +void LearningGraph::dealloc() { + assert(is_device); + CUDA_CHECK(cudaFree(d_colidx_)); + CUDA_CHECK(cudaFree(d_rowptr_)); + CUDA_CHECK(cudaFree(d_degrees_)); + if (edge_data_ != NULL) + CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) + CUDA_CHECK(cudaFree(d_vertex_data_)); +} + +void LearningGraph::allocOnDevice(bool no_edge_data__) { + if (d_colidx_ != NULL) + return; + CUDA_CHECK(cudaMalloc((void**)&d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK( + cudaMalloc((void**)&d_rowptr_, (num_vertices_ + 1) * sizeof(index_t))); + // CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * + // sizeof(index_t))); if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) + // &edge_data__, num_edges_ * sizeof(edge_data___t))); + // CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * + // sizeof(vdata_t))); + is_device = true; +} + +void LearningGraph::print_test() { + printf("d_rowptr_: 0x%x\n", d_rowptr_); + printf("d_colidx_: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); +} + +void LearningGraph::copy_to_gpu() { + allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), + num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyHostToDevice)); + print_test(); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyHostToDevice)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ + // * sizeof(vdata_t), cudaMemcpyHostToDevice)); +} + +void LearningGraph::copy_to_cpu() { + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, + num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, + (num_vertices_ + 1) * 
sizeof(index_t), + cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyDeviceToHost)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, + // num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); +} + +void LearningGraph::degree_counting() {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp new file mode 100644 index 0000000000..b8addfe887 --- /dev/null +++ b/libdeepgalois/src/math_functions.cpp @@ -0,0 +1,368 @@ +#include +#include +#include +#include +#include +#include "galois/Timer.h" +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/random.h" +#include "deepgalois/math_functions.hh" + +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header +extern "C" { +#include +} +#endif + +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet"; \ + exit(1); \ + } while (0); + +/* +#include +typedef boost::mt19937 rng_t; +inline rng_t* deepgalois_rng() { + return static_cast(Context::rng_stream().generator()); +} + +void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(deepgalois_rng(), random_distribution); + for (size_t i = 0; i < n; ++i) + r[i] = variate_generator(); +} +*/ + +// anon namespace so these things don't leak elsewhere +namespace { +static deepgalois::PerThreadRNG* per_thread_rng = nullptr; +} + +namespace deepgalois { + +namespace math { + +inline uint8_t bernoulli(float_t p) { + if (!per_thread_rng) { + per_thread_rng = new PerThreadRNG(); + } + return per_thread_rng->get_number() > p ? 1 : 0; +} + +//! wrapper function to call cblas_sgemm +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); + Tmatmul.stop(); +} + +#ifdef USE_MKL +void csrmm_cpu(const int M, const int N, const int K, const int, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nnz_idx, const float* B, const float beta, float* C) { +#else +void csrmm_cpu(const int, const int, const int, const int, const float, float*, + int*, int*, const float*, const float, float*) { +#endif +#ifdef USE_MKL + // mkl_set_num_threads(56); + // const char *matdescra = "GXXCX";//6 bytes + // const char transa = 'N'; + // mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, + // A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + sparse_status_t status; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_layout_t layout = + (is_row_major ? 
SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, + A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + sparse_operation_t transa = (need_trans ? SPARSE_OPERATION_TRANSPOSE + : SPARSE_OPERATION_NON_TRANSPOSE); + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + // descrA.mode = SPARSE_FILL_MODE_UPPER; + // descrA.diag = SPARSE_DIAG_NON_UNIT; + // mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + // mkl_sparse_optimize(csrA); + status = + mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + mkl_sparse_destroy(csrA); +#else + NOT_IMPLEMENTED; +#endif +} + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 + +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { +#ifdef USE_MKL + vsAdd(n, a, b, y); +#else +#ifdef __AVX2__ + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + y[i] = a[i] + b[i]; +#else + for (size_t i = 0; i < n; ++i) + y[i] = a[i] + b[i]; +#endif +#endif +} + +void scal(size_t n, const float_t alpha, float_t* x) { + cblas_sscal(n, alpha, x, 1); +} + +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +void axpy(size_t n, const float_t a, float_t* x, float_t* y) { + cblas_saxpy(n, a, x, 1, y, 1); +} + +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +// l2 normalization +float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } + +// dot product +float_t dot(size_t n, const float_t* x, const float_t* y) { + return cblas_sdot(n, x, 1, y, 1); +} + +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { + copy_cpu(n, x, z); + copy_cpu(n, y, z + n); +} + +void clear_cpu(size_t n, float_t* in) { + // for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in + n, 0); + // memset(in, 0, n*sizeof(float_t)); +} + +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + masks[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; +} + +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + size_t len = n * m; + + galois::do_all( + galois::iterate((size_t)0, len), + [&](size_t i) { masks[i] = bernoulli(dropout_rate); }, + galois::loopname("dropout RNG")); + + galois::do_all( + galois::iterate((size_t)0, len), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("dropout")); +} + 
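+// Sketch of the inverted-dropout convention these helpers follow. The caller
+// supplies `scale`; the usual choice (an assumption here, not enforced) is
+//
+//   float dropout_rate = 0.5f;
+//   float scale        = 1.0f / (1.0f - dropout_rate);   // 2.0f in this case
+//
+// With masks[i] drawn by bernoulli(dropout_rate) above -- which keeps a unit
+// with probability 1 - dropout_rate, assuming get_number() is uniform on
+// [0,1) -- the forward pass out[i] = in[i] * masks[i] * scale preserves the
+// expected activation, and d_dropout/d_dropout_cpu below reuse the same masks
+// and scale so dropped units receive zero gradient.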
+void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, + float_t* out) { + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; +} + +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* masks, float_t* out) { + galois::do_all( + galois::iterate((size_t)0, n * m), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("d_dropout")); +} + +void relu_cpu(size_t n, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); }, + galois::chunk_size<64>(), galois::loopname("relu")); +} + +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, + float_t* out) { + // TODO: vectorize + // check if original data greater than 0; if so keep grad + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, + galois::chunk_size<64>(), galois::loopname("d_relu")); +} + +void leaky_relu(float_t epsilon, float_t in, float_t& out) { + out = in > 0.0 ? in : epsilon * in; +} + +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out) { + out = in * (data > 0.0 ? 1.0 : epsilon); +} + +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; }, + galois::chunk_size<64>(), galois::loopname("leaky_relu")); +} + +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); + }, + galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void d_softmax(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and +// amazon inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + +// 0.5; } +inline float_t sigmoid_func(float_t x) { return 1. / (1. + expf(-x)); } + +// Sigmoid +void sigmoid(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; i++) { + out[i] = 1. / (1. 
+ expf(-in[i])); + } +} + +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { + for (size_t i = 0; i < n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} + +void copy_cpu(size_t n, const float_t* in, float_t* out) { + // std::copy(in, in + n, out); + // memcpy(out, in, sizeof(float_t) * n); + cblas_scopy(n, in, 1, out, 1); +} + +// num rows in A, C; num columns in B, C; num columns in A, rows in B +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// TODO make parallel +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + +float reduce_mean(size_t n, const float_t* x) { + float_t sum = 0.; + for (size_t i = 0; i < n; i++) { + sum += (float_t)x[i]; + } + return sum / (float_t)n; +} + +} // end namespace math +} // end namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu new file mode 100644 index 0000000000..b9f7686867 --- /dev/null +++ b/libdeepgalois/src/math_functions.cu @@ -0,0 +1,800 @@ +#include "deepgalois/math_functions.hh" +#include "deepgalois/DistContext.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include + +__global__ void init_const_kernel(int n, float_t value, float_t* array) { + CUDA_KERNEL_LOOP(i, n) { array[i] = value; } +} + +void init_const_gpu(int n, float_t value, float_t* array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + +__global__ void isnan_test(const int n, const float* data, bool* result) { + CUDA_KERNEL_LOOP(i, n) { + if (isnan(data[i])) + *result = true; + } +} + +bool isnan_gpu(int n, const float_t* array) { + bool *d_result, h_result = false; + cudaMalloc((void**)&d_result, sizeof(bool)); + cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); + isnan_test<<>>(n, array, d_result); + CudaTest("solving init_const kernel failed"); + cudaMemcpy(&h_result, d_result, sizeof(bool), cudaMemcpyDeviceToHost); + return h_result; +} + +void gpu_rng_uniform(size_t n, float_t* r) { + CURAND_CHECK( + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); +} + +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK( + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK(curandGenerateNormal(deepgalois::DistContext::curand_generator(), r, + n, mu, sigma)); +} + +bool is_allocated_device(float_t* data) { + if (data == NULL) + return false; + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); + if (attributes.devicePointer != NULL) + return true; + return false; +} + +void float_malloc_device(int n, float_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); +} + +void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +void 
uint8_malloc_device(int n, uint8_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(uint8_t))); +} + +void uint8_free_device(uint8_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(uint8_t), cudaMemcpyHostToDevice)); +} + +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +} + +__global__ void setup_curand_kernel(const int n, curandState* state) { + CUDA_KERNEL_LOOP(i, n) { + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7 + i, i, 0, &state[i]); // Each thread gets different seed + } +} + +__global__ void dropout_kernel(int n, float scale, float threshold, + float_t* rands, const float_t* in, mask_t* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; + } +} + +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + float_t* rands; + float_malloc_device(n, rands); + gpu_rng_uniform(n, rands); + dropout_kernel<<>>( + n, scale, dropout_rate, rands, in, masks, out); + CudaTest("solving dropout kernel failed"); + float_free_device(rands); +} + +__global__ void d_dropout_kernel(int n, float scale, float threshold, + const float_t* in, const mask_t* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out) { + d_dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); + CudaTest("solving d_dropout kernel failed"); +} + +// flattern data into 1D before feed into the ReLU operater +__global__ void relu_kernel(const int n, const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : 0; } +} + +void relu_gpu(const int n, const float_t* in, float_t* out) { + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); +} + +__global__ void d_relu_kernel(const int n, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { out_diff[i] = data[i] > 0 ? in_diff[i] : 0; } +} + +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_relu_kernel<<>>(n, in_diff, data, + out_diff); + CudaTest("solving d_relu kernel failed"); +} + +// flattern data into 1D before feed into the ReLU operater +__global__ void leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; } +} + +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, + out); + CudaTest("solving leaky_relu kernel failed"); +} + +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { + out_diff[i] = in_diff[i] * (data[i] > 0 ? 
1.0 : epsilon); + } +} + +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_leaky_relu_kernel<<>>( + n, epsilon, in_diff, data, out_diff); + CudaTest("solving d_leaky_relu kernel failed"); +} + +__global__ void matmul_kernel(int x, int y, int z, const float_t* A, + const float_t* B, float_t* C) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; +} + +#define TILE_SZ 16 +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C) { + dim3 threadsPerBlock(TILE_SZ, TILE_SZ); + dim3 blocksPerGrid((y - 1) / TILE_SZ + 1, (x - 1) / TILE_SZ + 1); + matmul_kernel<<>>(x, y, z, A, B, C); + CudaTest("solving matmul kernel failed"); +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(deepgalois::DistContext::cublas_handle(), cuTransB, + cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, + N)); +} + +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix +// for vertex feature tensor. However, since cusparse only supports +// column-major, while feature tensor is stored in row-major, the actual +// computation is: C = trans(A x trans(B)). Currently, we use cublasSgeam to +// implement transposition and allocate intermediate workspace memory +// (transpose_C) for this. 
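+// Dimension bookkeeping for the call below (A is M x K in CSR, B is the
+// K x N row-major feature matrix, C is the M x N row-major output):
+//  - cusparseScsrmm2 reads B as column-major: the same buffer with leading
+//    dimension N is B^T (N x K), so op(B) = CUSPARSE_OPERATION_TRANSPOSE
+//    recovers B.
+//  - The SpMM result is written column-major into transpose_C (leading
+//    dimension M), which from the row-major point of view is (A * B)^T.
+//  - cublasSgeam then transposes transpose_C into C, yielding the row-major
+//    M x N product the rest of the code expects.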
+void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nnz_idx, const float* B, const float beta, + float* transpose_C, float* C) { + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); + // transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +/* +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float *transpose_C, float* C) { + std::cout << "[debug]: csrmm_gpu\n"; + cusparseSpMatDescr_t A_descr; + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, +A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + cusparseDnMatDescr_t B_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, +CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, +CUSPARSE_ORDER_COL)); size_t bufferSize; + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, +CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); + cudaDeviceSynchronize(); + void* buffer = NULL; + if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); + CUSPARSE_CHECK(cusparseSpMM(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, +CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +//*/ +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
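+          // The op is flipped on purpose: cuBLAS is column-major, so the
+          // row-major M x N matrix A, read with leading dimension N, is seen
+          // by cublasSgemv as the N x M matrix A^T. Requesting CUBLAS_OP_T on
+          // that reinterpretation therefore computes y = alpha * A * x +
+          // beta * y for CblasNoTrans, while CUBLAS_OP_N yields the
+          // A^T * x product.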
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(deepgalois::DistContext::cublas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float* X) { + CUBLAS_CHECK( + cublasSscal(deepgalois::DistContext::cublas_handle(), N, &alpha, X, 1)); +} + +void dot_gpu(const int n, const float* x, const float* y, float* out) { + CUBLAS_CHECK( + cublasSdot(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1, out)); +} + +void asum_gpu(const int n, const float* x, float* y) { + CUBLAS_CHECK(cublasSasum(deepgalois::DistContext::cublas_handle(), n, x, 1, y)); +} + +void scale_gpu(const int n, const float alpha, const float* x, float* y) { + CUBLAS_CHECK( + cublasScopy(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::DistContext::cublas_handle(), n, &alpha, y, 1)); +} + +__global__ void set_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = alpha; } +} + +void set_gpu(const int n, const float_t alpha, float_t* y) { + if (alpha == 0) { + CUDA_CHECK(cudaMemset(y, 0, sizeof(float_t) * n)); + return; + } + set_kernel<<>>(n, alpha, y); + CudaTest("solving set kernel failed"); +} + +__global__ void add_scalar_kernel(const int n, const float_t a, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] += a; } +} + +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(n, alpha, Y); + CudaTest("solving add_scalar kernel failed"); +} + +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a[i] + b[i]; } +} + +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(n, a, b, y); + CudaTest("solving vadd kernel failed"); +} + +__global__ void axpy_kernel(const int n, const float_t a, const float_t* x, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a * x[i] + y[i]; } +} + +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { + // axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK( + cublasSaxpy(deepgalois::DistContext::cublas_handle(), n, &a, x, 1, y, 1)); + CudaTest("solving axpy kernel failed"); +} + +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { + CUDA_KERNEL_LOOP(i, n) { + float_t product = a[i] * a[i]; + atomicAdd(sum, product); + } +} + +acc_t l2_norm_gpu(int n, const float_t* x) { + float_t sum = 0.0; + CUBLAS_CHECK(cublasSnrm2(deepgalois::DistContext::cublas_handle(), n, x, 1, &sum)); + // float_t *d_sum; + // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); + // l2_norm_kernel<<>>(n, x, d_sum); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), + // cudaMemcpyDeviceToHost)); + return (acc_t)sum / 2.0; +} + +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out) {} + +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff) {} + +void copy_gpu(int len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + +// TODO: use warp +__device__ void softmax_device(int n, const float_t* input, float_t* output) { + float_t max = input[0]; + for (int i = 1; i < n; i++) + if (input[i] > max) + max = input[i]; + float_t denominator = 0.0; + for (int i = 0; i < n; i++) { + output[i] = expf(input[i] - max); + denominator += output[i]; + if (output[i] < 0.0) + 
printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + // assert(output[i] >= 0.0); + } + assert(denominator != 0.0); + for (int i = 0; i < n; i++) { + output[i] /= denominator; + // assert(output[i] >= 0.0); + // assert(output[i] <= 1.0); + } +} + +__device__ void sigmoid_device(int n, const float_t* in, float_t* out) { + for (int i = 0; i < n; i++) + out[i] = 1. / (1. + expf(-in[i])); +} + +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= logf(float_t(1e-10)); + else + loss -= logf(p[idx]); +} + +// y: ground truth +// p: predictions +__device__ void cross_entropy_multi_device(int n, const label_t* y, + const float_t* p, float_t& loss) { + for (int i = 0; i < n; i++) { + if (y[i] == 0) + continue; + if (p[i] == float_t(0)) + loss -= logf(float_t(1e-10)); // avoid NaN exception + else + loss -= logf(p[i]); + } +} + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void softmax_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + // normalize using softmax + softmax_device(len, in_data + len * id, out_data + len * id); + // loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len * id, loss[id]); + } + } +} + +void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + sigmoid_device(len, in_data + len * id, out_data + len * id); + cross_entropy_multi_device(len, labels, out_data + len * id, loss[id]); + } + } +} + +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + sigmoid_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); + CudaTest("solving sigmoid_cross_entropy kernel failed"); +} + +__device__ void d_cross_entropy_device(int n, const label_t idx, + const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; + } +} + +__global__ void d_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, float_t* grad) { + int base = begin * len; + CUDA_KERNEL_LOOP(i, (end - begin) * len) { + int id = begin + i / len; + if (masks[id] == 1) { // masked + if (i % len == (int)labels[id]) + grad[i] = -1.0 / (data[i + base] + 1e-10); + else + grad[i] = 0.0; + // d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + } + } +} + +__global__ void d_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* 
grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + grad[wid * len + pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + grad[wid * len + pid] = 0.0; + } + } + } + } +} + +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, + float_t* dy) { + for (int i = 0; i < n; i++) { + dy[i] = 0; + for (int j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__global__ void d_softmax_kernel(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + d_softmax_device(len, data + len * id, in_grad + len * i, + out_grad + len * id); + } + } +} + +__global__ void d_softmax_warp(int len, int begin, int end, const mask_t* masks, + const float_t* data, const float_t* in_grad, + float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + p[warp_lane][pid] = data[base + pid]; + d[warp_lane][pid] = in_grad[wid * len + pid]; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = + (j == pid) ? 
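+                // Softmax Jacobian entry: d p_pid / d x_j is
+                // p_pid * (1 - p_pid) when j == pid and -p_j * p_pid
+                // otherwise; the loop accumulates its dot product with the
+                // incoming gradient held in d[warp_lane][*].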
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + out_grad[base + pid] = sum; + } + } + __syncthreads(); + } + } +} + +__global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* out, + float_t* diff) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy_device(len, labels[id], out + len * id, out_grad); + d_softmax_device(len, out + len * id, out_grad, diff + len * id); + } + } +} + +__global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + d[warp_lane][pid] = 0.0; + } + } + __syncthreads(); + + // softmax derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + grad[base + pid] = sum; + } + } + __syncthreads(); + } + } +} + +void d_softmax_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + // d_softmax_cross_entropy_kernel<<>>( + // len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax_cross_entropy kernel failed"); + // float_t *grad; + // float_malloc_device((end-begin)*len, grad); + // d_cross_entropy_kernel<<>>( + // d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, labels, out, grad); + // CudaTest("solving d_cross_entropy kernel failed"); + // d_softmax_kernel<<>>( + // d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, out, grad, diff); + // CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); +} + +__global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + // if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = + -(float_t)labels[base + pid] / (p[warp_lane][pid] + 1e-10); + // else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + } + } + __syncthreads(); + + // sigmoid derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base + pid] = dp * self * (float_t(1) - self); + } + } + __syncthreads(); + } + } +} + +void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_sigmoid_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); + CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); +} + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +// acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* +// loss); +acc_t 
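+// Device-side counterpart of the layers' CPU get_prediction_loss(): the
+// kernel sums loss[i] over vertices whose mask is set, and the result is
+// divided by the caller-supplied `count`. (The CPU path divides by the number
+// of masked samples it actually visited instead.)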
masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp new file mode 100644 index 0000000000..e5e9fa7c10 --- /dev/null +++ b/libdeepgalois/src/node.cpp @@ -0,0 +1,28 @@ +#include "deepgalois/layers/node.h" +#include + +namespace deepgalois { + +void edge::alloc() { + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; +} + +void edge::merge_grads(float_t* dst) { + assert(grad_ != NULL); + if (dst) + delete[] dst; + dst = new float_t[ft_dim_]; + std::copy(grad_, grad_ + ft_dim_, dst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) + dst[i] += grad_[sample * ft_dim_ + i]; + } +} + +void edge::clear_grads() { + std::fill(grad_, grad_ + ft_dim_ * num_samples_, float_t(0)); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu new file mode 100644 index 0000000000..2151162752 --- /dev/null +++ b/libdeepgalois/src/node.cu @@ -0,0 +1,24 @@ +#include "deepgalois/layers/node.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void edge::alloc() { + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); +} + +void edge::clear_grads() { + // CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp new file mode 100644 index 0000000000..4538d1c956 --- /dev/null +++ b/libdeepgalois/src/optimizer.cpp @@ -0,0 +1,101 @@ +#include "deepgalois/optimizer.h" +#include "galois/Galois.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void adagrad::update(const vec_t& dW, vec_t& W) { + vec_t& g = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + /* + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + */ +} + +void RMSprop::update(const vec_t& dW, vec_t& W) { + vec_t& g = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); +} + +void adam::update(const vec_t& dW, vec_t& W) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / 
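+          // Bias-corrected Adam step: b1_t and b2_t accumulate powers of b1
+          // and b2 across calls (see the updates after the loop), so this
+          // computes m_hat = mt / (1 - b1^t), v_hat = vt / (1 - b2^t) and
+          //   W -= alpha * m_hat / sqrt(v_hat + eps)
+          // with eps placed inside the square root in this variant.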
(float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + // TODO/NOTE: this is incorrect: adam parameters should not be shared + // among layers, but this is making it shared + b1_t *= b1; + b2_t *= b2; +} + +void adamax::update(const vec_t& dW, vec_t& W) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; +} + +void gradient_descent::update(const vec_t& dW, vec_t& W) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); +} + +void momentum::update(const vec_t& dW, vec_t& W) { + vec_t& dWprev = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); +} + +void nesterov_momentum::update(const vec_t& dW, vec_t& W) { + vec_t& dWprev = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu new file mode 100644 index 0000000000..15f2fe5515 --- /dev/null +++ b/libdeepgalois/src/optimizer.cu @@ -0,0 +1,55 @@ +#include "deepgalois/optimizer.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" + +__global__ void update_kernel(const int n, float_t alpha, float_t b1, + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { + CUDA_KERNEL_LOOP(i, n) { + mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; + vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; + W[i] -= + alpha * (mt[i] / (1.0 - b1_t)) / sqrtf((vt[i] / (1.0 - b2_t)) + eps); + } +} + +namespace deepgalois { + +template +template +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t* key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; +} + +void adam::update(const vec_t& dW, vec_t& W) {} +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + // std::cout << "updating weights on GPU, n = " << n << "\n"; + // print_device_vector(10, dW, "dW"); + float_t* cache = get_gpu<0>(n, W); + float_t* velocity = get_gpu<1>(n, W); + + update_kernel<<>>( + n, alpha, b1, b2, b1_t, b2_t, eps, cache, velocity, dW, W); + b1_t *= b1; + b2_t *= b2; +} + +void adagrad::update_gpu(const size_t, const float_t*, float_t*) {} + +void RMSprop::update_gpu(const size_t, const float_t*, float_t*) {} + +void adamax::update_gpu(const size_t, const float_t*, float_t*) {} + +void gradient_descent::update_gpu(const size_t, const float_t*, float_t*) {} + +void momentum::update_gpu(const size_t, const float_t*, float_t*) {} + +void nesterov_momentum::update_gpu(const 
size_t, const float_t*, float_t*) {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp new file mode 100644 index 0000000000..d7e1bcf44b --- /dev/null +++ b/libdeepgalois/src/reader.cpp @@ -0,0 +1,311 @@ +#include "deepgalois/reader.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" +#include "galois/Galois.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ +#include +#include +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif + +namespace deepgalois { + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. +size_t Reader::read_labels(bool is_single_class, label_t*& labels) { + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reader: Reading labels...\n"); +#endif + + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, num_classes; // m: number of samples + in >> m >> num_classes >> std::ws; + if (is_single_class) { + std::cout << "[" << myID + << "] Reader: Using single-class (one-hot) labels\n"; + // galois::gPrint("[", myID, + // "] Reader: Using single-class (one-hot) labels\n"); + labels = + new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + } else { + // galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) + // labels\n"); + std::cout << "[" << myID + << "] Reader: Using multi-class (one-hot) labels\n"; + labels = + new label_t[m * + num_classes]; // multi-class label for each vertex: N x E + } + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v * num_classes + idx] = x; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "[" << myID << "] Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + // galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); + // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << + // unsigned(labels[i]) << "\n"; + return num_classes; +} + +//! Read features, return the length of a feature vector +//! Features are stored in the Context class +size_t Reader::read_features(float_t*& feats, std::string filetype) { + // filetype = "txt"; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + size_t m, feat_len; // m = number of vertices + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + in.read((char*)feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u * feat_len + v] = w; + } + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + // for (auto i = 0; i < 6; i ++) + // for (auto j = 0; j < 6; j ++) + // std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << + // "\n"; + return feat_len; +} + +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, + size_t& end, mask_t* masks) { + std::cout << "n:" << n << "\n"; + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " + << end << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; + // galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", + // end, + // ") Number of valid samples: ", sample_count, " (", + // (float)sample_count / (float)n * (float)100, "\%)\n"); + in.close(); + return sample_count; +} + +void Reader::progressPrint(unsigned max, unsigned i) { + const unsigned nsteps = 10; + unsigned ineachstep = (max / nsteps); + if (ineachstep == 0) + ineachstep = 1; + if (i % ineachstep == 0) { + int progress = ((size_t)i * 100) / max + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} + +void Reader::readGraphFromGRFile(LearningGraph* g) { + std::string filename = path + dataset_str + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = 
mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t* outIdx = fptr; + fptr += nv; + uint32_t* fptr32 = (uint32_t*)fptr; + uint32_t* outs = fptr32; + fptr32 += ne; + if (ne % 2) + fptr32 += 1; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data not supported.\n"; + exit(1); + } + g->allocateFrom(nv, ne); + auto rowptr = g->row_start_host_ptr(); + for (unsigned vid = 0; vid < nv; ++vid) { + g->fixEndEdge(vid, le64toh(outIdx[vid])); + auto degree = rowptr[vid + 1] - rowptr[vid]; + for (unsigned jj = 0; jj < degree; ++jj) { + unsigned eid = rowptr[vid] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= nv) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, + eid); + exit(0); + } + g->constructEdge(eid, dst); + } + progressPrint(nv, vid); + } + ifs.close(); + + /* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); + */ + t.Stop(); + // double runtime = t.Millisecs(); + // std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + // << masterLength / 1000.0 / runtime << " MB/s)\n\n"; +} + +/* +void add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + //g.getData(src) = 1; + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); + bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } + for (auto e = begin; e != end; e++) { + auto dst = og.getEdgeDst(e); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + } else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); + } + } +} +//*/ + +} // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp new file mode 100644 index 0000000000..1b237ff7c3 --- /dev/null +++ b/libdeepgalois/src/utils.cpp @@ -0,0 +1,132 @@ +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif + +namespace deepgalois { + +// parallel prefix sum +template +OutTy* parallel_prefix_sum(const std::vector& in) { + const size_t block_size = 1 << 20; + const size_t num_blocks = (in.size() + block_size - 1) / block_size; + std::vector 
local_sums(num_blocks); + // count how many bits are set on each thread + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks + 1); + OutTy total = 0; + for (size_t block = 0; block < num_blocks; block++) { + bulk_prefix[block] = total; + total += local_sums[block]; + } + bulk_prefix[num_blocks] = total; + // TODO do not use new here: difficult to track and free later + OutTy* prefix = new OutTy[in.size() + 1]; + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); + prefix[in.size()] = bulk_prefix[num_blocks]; + return prefix; +} + +template uint32_t* +parallel_prefix_sum(const std::vector& in); + +// Compute the F1 score, also known as balanced F-score or F-measure +// The F1 score can be interpreted as a weighted average of the precision and +// recall, where an F1 score reaches its best value at 1 and worst score at 0. +// The relative contribution of precision and recall to the F1 score are equal. +// The formula for the F1 score is: +// F1 = 2 * (precision * recall) / (precision + recall) +// where precision = TP / (TP + FP), recall = TP / (TP + FN) +// TP: true positive; FP: false positive; FN: false negative. +// In the multi-class and multi-label case, this is the weighted average of the +// F1 score of each class. Please refer to +// https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf +// (p.1672) and +// https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp +acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, + size_t num_classes, label_t* ground_truth, + float_t* pred) { + // TODO dist version; make aware of distributed execution + double precision_cls(0.), recall_cls(0.), f1_accum(0.); + int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + + for (size_t col = 0; col < num_classes; col++) { + int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + + for (size_t row = begin; row < end; row++) { + if (masks == NULL || masks[row] == 1) { + auto idx = row * num_classes + col; + if (ground_truth[idx] == 1 && pred[idx] > 0.5) { + //__sync_fetch_and_add(&tp_cls, 1); + tp_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { + //__sync_fetch_and_add(&fp_cls, 1); + fp_cls += 1; + } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&fn_cls, 1); + fn_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&tn_cls, 1); + tn_cls += 1; + } + } + } + + tp_accum += tp_cls; + fn_accum += fn_cls; + fp_accum += fp_cls; + tn_accum += tn_cls; + precision_cls = + tp_cls + fp_cls > 0 ? (double)tp_cls / (double)(tp_cls + fp_cls) : 0.; + recall_cls = + tp_cls + fn_cls > 0 ? (double)tp_cls / (double)(tp_cls + fn_cls) : 0.; + f1_accum += + recall_cls + precision_cls > 0. + ? 2. 
* (recall_cls * precision_cls) / (recall_cls + precision_cls) + : 0.; + } + + double f1_macro = f1_accum / (double)num_classes; + // double accuracy_mic = + // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum + fp_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fp_accum) + : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; + double f1_micro = + recall_mic + precision_mic > 0. + ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) + : 0.; + + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed + << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; + // galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + + return f1_micro; +} + +} // namespace deepgalois diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 138a4edabd..2930d37cbf 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -21,7 +21,7 @@ target_include_directories(galois_dist_async PUBLIC target_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX) target_link_libraries(galois_dist_async PUBLIC galois_shmem) -target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) +#target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) if (GALOIS_USE_BARE_MPI) target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1) @@ -49,7 +49,7 @@ if (GALOIS_USE_LCI) add_dependencies(galois_dist_async lci) target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2) - target_include_directories(galois_dist_async PUBLIC + target_include_directories(galois_dist_async PUBLIC $ $ ) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h new file mode 100644 index 0000000000..8066f3a25e --- /dev/null +++ b/libdist/include/galois/BufferWrapper.h @@ -0,0 +1,115 @@ +#ifndef GALOIS_BUFFER_WRAPPER +#define GALOIS_BUFFER_WRAPPER +#include "galois/gstl.h" +#include + +namespace galois { + +//! Wraps a pointer representing an array with the number of elements the +//! array contains (or that we want to handle with this class) +//! +//! Used to avoid copying of memory into a vector for +//! serialization/deserialization purpose +//! @todo give this a better name +template +class BufferWrapper { +public: + using size_type = size_t; + using value_type = ElementType; + +private: + //! This vector is allocated when creating a buffer wrapper from scratch + //! (i.e. during deserialization into one) + galois::gstl::Vector dummy; + //! Raw memory kept by this class; either points to existing memory or is + //! empty (vector.data changes when this object is copied, causes issues + //! with correcntess) + ElementType* raw_memory; + //! Number of elements that can be accessed from the raw_memory pointer + size_type num_elements; + +public: + //! Default constructor 0s everything + BufferWrapper() { + dummy.clear(); + this->raw_memory = 0; + this->num_elements = 0; + } + + //! frees dummy vector + ~BufferWrapper() { + // explicit vector clear; regular destructor probably frees it, but + // doing it for safetey + if (dummy.size()) { + dummy.clear(); + } + } + + //! Save a pointer and the number of elements in that array that this can + //! 
access + BufferWrapper(ElementType* pointer, size_t num_elements_) + : raw_memory(pointer), num_elements(num_elements_){}; + + //! Returns element at some specified index of the array + ElementType& operator[](size_t index) { + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } + } + + //! Returns element at some specified index of the array; const i.e. not + //! modifiable + const ElementType& operator[](size_t index) const { + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } + } + + //! Return number of elements in the array + size_t size() const { return this->num_elements; } + + //! return unmodifiable pointer to raw_memory + const ElementType* data() const { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + + //! return pointer to raw_memory + ElementType* data() { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + + //! Allocates memory in the underlying vector; should only be used for + //! deserialization into this class during communication + //! This also means you shouldn't use raw_data + void resize(size_t new_size) { + if (!this->dummy.size()) { + this->dummy.resize(new_size); + this->num_elements = this->dummy.size(); + } else { + GALOIS_DIE("calling resize when there is already memory " + "allocated"); + } + } + + ElementType* get_vec_data() { + assert(this->dummy.size()); + return dummy.data(); + } +}; + +} // namespace galois +#endif diff --git a/libdist/include/galois/DTerminationDetector.h b/libdist/include/galois/DTerminationDetector.h index 0f6d696838..28c58b3666 100644 --- a/libdist/include/galois/DTerminationDetector.h +++ b/libdist/include/galois/DTerminationDetector.h @@ -150,10 +150,8 @@ class DGTerminator { bool terminate() { bool active = (local_mdata != 0); - // if (active) galois::gDebug("[", net.ID, "] local work done \n"); if (!active) { active = net.anyPendingSends(); - // if (active) galois::gDebug("[", net.ID, "] pending send \n"); } int snapshot_ended = 0; if (!active) { @@ -166,8 +164,6 @@ class DGTerminator { } if (!active) { // check pending receives after checking snapshot active = net.anyPendingReceives(); - if (active) - galois::gDebug("[", net.ID, "] pending receive"); } if (active) { work_done = true; @@ -178,16 +174,11 @@ class DGTerminator { work_done = false; prev_snapshot = snapshot; ++snapshot; - galois::gDebug("[", net.ID, "] work done, taking snapshot ", - snapshot); initiate_snapshot(); } else if (prev_snapshot != snapshot) { prev_snapshot = snapshot; - galois::gDebug("[", net.ID, "] no work done, taking snapshot ", - snapshot); initiate_snapshot(); } else { - galois::gDebug("[", net.ID, "] terminating ", snapshot); // an explicit barrier may be required here // so that the next async phase begins on all hosts at the same time // however, this may add overheads when it is not required diff --git a/libdist/include/galois/DistGalois.h b/libdist/include/galois/DistGalois.h index b87c539f3e..e39f311470 100644 --- a/libdist/include/galois/DistGalois.h +++ b/libdist/include/galois/DistGalois.h @@ -44,10 +44,10 @@ class DistMemSys : public runtime::SharedMem { ~DistMemSys(); - DistMemSys(const DistMemSys&) = delete; + DistMemSys(const DistMemSys&) = delete; DistMemSys& operator=(const DistMemSys&) = delete; - DistMemSys(DistMemSys&&) = delete; + DistMemSys(DistMemSys&&) = delete; DistMemSys& operator=(DistMemSys&&) = delete; }; 
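The BufferWrapper introduced above (libdist/include/galois/BufferWrapper.h) is what the later serialization changes build on: it wraps a raw pointer plus an element count so an existing array can be serialized without first being copied into a vector, while resize() switches it to an internally owned vector for the deserialization path. The standalone sketch below is illustrative only and not part of the patch; it uses just the members visible in the header (the wrapping constructor, operator[], size(), and resize()) and assumes a live galois::SharedMemSys, since the internal gstl::Vector is backed by the Galois runtime allocator.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include "galois/Galois.h"
#include "galois/BufferWrapper.h"

int main() {
  galois::SharedMemSys G; // runtime init assumed for gstl::Vector's allocator

  uint32_t payload[4] = {10, 20, 30, 40};

  // Sending side: wrap memory we already own; nothing is copied or allocated.
  galois::BufferWrapper<uint32_t> view(payload, 4);
  assert(view.size() == 4);
  assert(view[2] == 30);

  // Receiving side: a default-constructed wrapper owns no memory, so resize()
  // allocates the internal vector that deserialization would normally fill.
  galois::BufferWrapper<uint32_t> received;
  received.resize(view.size());
  for (size_t i = 0; i < received.size(); ++i)
    received[i] = view[i];
  assert(received[3] == 40);
  return 0;
}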
diff --git a/libdist/include/galois/runtime/Network.h b/libdist/include/galois/runtime/Network.h index e4695c0c2b..1560b20914 100644 --- a/libdist/include/galois/runtime/Network.h +++ b/libdist/include/galois/runtime/Network.h @@ -109,7 +109,7 @@ class NetworkInterface { //! tag (tag) and some data (buf) //! on the receiver, buf will be returned on a receiveTagged(tag) //! buf is invalidated by this operation - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int type = 0) = 0; //! Send a message to all hosts. A message is simply a @@ -123,9 +123,6 @@ class NetworkInterface { template void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param); - //! Receive and dispatch messages - void handleReceives(); - //! Wrapper to reset the mem usage tracker's stats inline void resetMemUsage() { memUsageTracker.resetMemUsage(); } @@ -134,8 +131,7 @@ class NetworkInterface { //! Receive a tagged message virtual std::optional> - recieveTagged(uint32_t tag, std::unique_lock* rlg, - int type = 0) = 0; + recieveTagged(uint32_t tag, int type = 0) = 0; //! move send buffers out to network virtual void flush() = 0; @@ -195,9 +191,6 @@ NetworkInterface& makeNetworkLCI(); //! @warning Should not be called within a parallel region; assumes only one //! thread is calling it substrate::Barrier& getHostBarrier(); -//! Returns a fence that ensures all pending messages are delivered, acting -//! like a memory-barrier -substrate::Barrier& getHostFence(); //////////////////////////////////////////////////////////////////////////////// // Implementations @@ -220,7 +213,7 @@ void NetworkInterface::sendSimple(uint32_t dest, SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param..., (uintptr_t)genericLandingPad); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } template diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index b7f7cab60e..bc3cad4b01 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -28,7 +28,9 @@ #define GALOIS_RUNTIME_SERIALIZE_H #include +#include #include +#include #include #include #include @@ -43,64 +45,81 @@ #include #include #include "galois/CopyableTuple.h" +#include "galois/BufferWrapper.h" #include "galois/Bag.h" namespace galois { namespace runtime { +struct BufferHeader { + enum class BufferType { kSingleMessage, kMultipleMessages, kPartialMessage }; + BufferType type{BufferType::kSingleMessage}; + uint8_t num_segments{1}; + uint8_t segment_id{0}; + uint8_t segment_tag{0}; +}; + class DeSerializeBuffer; // forward declaration for friend declaration /** * Buffer for serialization of data. Mainly used during network communication. */ class SerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + //! Access to a deserialize buffer friend DeSerializeBuffer; //! type of data buffer // using vTy = std::vector; - using vTy = galois::PODResizeableArray; + using vTy = galois::PODResizeableArray; + using size_type = vTy::size_type; + //! the actual data stored in this buffer vTy bufdata; public: //! default constructor - SerializeBuffer() = default; + SerializeBuffer() { + BufferHeader header; + insert(reinterpret_cast(&header), kHeaderSize); + } + //! disabled copy constructor SerializeBuffer(SerializeBuffer&& rhs) = default; - //! Creates a buffer from another buffer - //! @param d buffer to create from - //! 
@param len amount of copy from buffer d - SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {} + + SerializeBuffer& operator=(SerializeBuffer&& rhs) { + auto buf = std::move(rhs); + bufdata = std::move(buf.get()); + return *this; + } //! Push a character onto the serialize buffer inline void push(const char c) { bufdata.push_back(c); } //! Insert characters from a buffer into the serialize buffer void insert(const uint8_t* c, size_t bytes) { - bufdata.insert(bufdata.end(), c, c + bytes); + if (bytes > 0) { + bufdata.insert(bufdata.end(), c, c + bytes); + } } //! Insert characters from a buffer into the serialize buffer at a particular //! offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { - std::copy_n(c, bytes, bufdata.begin() + offset); + offset += kHeaderSize; + assert((offset + bytes) <= bufdata.size()); + if (bytes > 0) { + std::copy_n(c, bytes, bufdata.begin() + offset); + } } - /** - * Reserve space at the end for inserting new data into the serialize - * buffer - * - * @param bytes number of bytes to reserve at the end - * @returns offset to the end of the buffer before new space was reserved - */ - size_t encomber(size_t bytes) { - size_t retval = bufdata.size(); - bufdata.resize(retval + bytes); - return retval; - } + //! Returns an iterator to the beginning of the data in this serialize buffer + vTy::const_iterator begin() const { return bufdata.cbegin(); } + //! Returns an iterator to the end of the data in this serialize buffer + vTy::const_iterator end() const { return bufdata.cend(); } - void resize(size_t bytes) { bufdata.resize(bytes); } + void resize(size_t bytes) { bufdata.resize(kHeaderSize + bytes); } /** * Reserve more space in the serialize buffer. @@ -110,34 +129,20 @@ class SerializeBuffer { void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); } //! Returns a pointer to the data stored in this serialize buffer - const uint8_t* linearData() const { return bufdata.data(); } + const uint8_t* linearData() const { return bufdata.data() + kHeaderSize; } //! Returns vector of data stored in this serialize buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } - //! Returns an iterator to the beginning of the data in this serialize buffer - vTy::const_iterator begin() const { return bufdata.cbegin(); } - //! Returns an iterator to the end of the data in this serialize buffer - vTy::const_iterator end() const { return bufdata.cend(); } - - using size_type = vTy::size_type; - - //! Returns the size of the serialize buffer - size_type size() const { return bufdata.size(); } - - //! Utility print function for the serialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{" << std::hex; - for (auto& i : bufdata) - o << (unsigned int)i << " "; - o << std::dec << "}>"; + //! Get a pointer to the remaining data of the deserialize buffer + //! (as determined by offset) + const uint8_t* data() const { return bufdata.data() + kHeaderSize; } + uint8_t* data() { return bufdata.data() + kHeaderSize; } + uint8_t* DataAtOffset(size_t offset) { + return bufdata.data() + kHeaderSize + offset; } - //! Operator that calls the print function of the serialize buffer - friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) { - b.print(os); - return os; - } + //! Returns the size of the serialize buffer + size_type size() const { return bufdata.size() - kHeaderSize; } }; /** @@ -145,50 +150,54 @@ class SerializeBuffer { * communication. 
*/ class DeSerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); //! Access to serialize buffer friend SerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer - vTy bufdata; - int offset; + vTy bufdata{kHeaderSize}; + size_t offset{kHeaderSize}; public: //! Constructor initializes offset into buffer to 0 - DeSerializeBuffer() : offset(0) {} + DeSerializeBuffer() : offset(kHeaderSize) {} //! Disable copy constructor DeSerializeBuffer(DeSerializeBuffer&&) = default; //! Move constructor //! @param v vector to act as deserialize buffer //! @param start offset to start saving data into DeSerializeBuffer(vTy&& v, uint32_t start = 0) - : bufdata(std::move(v)), offset(start) {} + : bufdata(std::move(v)), offset(start + kHeaderSize) { + assert(bufdata.size() >= offset); + } //! Constructor that takes an existing vector to use as the deserialize //! buffer explicit DeSerializeBuffer(vTy& data) { bufdata.swap(data); - offset = 0; + offset = kHeaderSize; } /** * Initializes the deserialize buffer with a certain size * @param [in] count size to initialize buffer to */ - explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {} + explicit DeSerializeBuffer(int count) + : bufdata(count + kHeaderSize), offset(kHeaderSize) {} /** * Initializes the deserialize buffer using vector initialization from * 2 iterators. */ template - DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {} + DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{kHeaderSize} {} /** * Initialize a deserialize buffer from a serialize buffer */ - explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) { + explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(kHeaderSize) { bufdata.swap(buf.bufdata); } @@ -202,31 +211,15 @@ class DeSerializeBuffer { * @param count new size of buffer */ void reset(int count) { - offset = 0; - bufdata.resize(count); + offset = kHeaderSize; + bufdata.resize(count + kHeaderSize); } - //! Gets the current offset into the deserialize buffer - unsigned getOffset() const { return offset; } - //! Sets the offset into the deserialize buffer - void setOffset(unsigned off) { - assert(off <= size()); - offset = off; - } - - //! Gets the size of the deserialize buffer - unsigned size() const { return bufdata.size(); } - - //! Returns true if the deserialize buffer is empty - //! @returns true if the deserialize buffer is empty - bool empty() const { return bufdata.empty(); } - //! Get the next character in the deserialize buffer unsigned char pop() { return bufdata.at(offset++); } - //! Clears the last x bytes of the deserialize buffer, resizing it as well - //! @param x How many bytes from the end to clear - void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); } + //! Gets the size of the deserialize buffer + unsigned size() const { return bufdata.size() - offset; } /** * Extracts a certain amount of data from the deserialize buffer @@ -235,45 +228,23 @@ class DeSerializeBuffer { * @param num Amount of data to get from deserialize buffer */ void extract(uint8_t* dst, size_t num) { + assert(offset >= kHeaderSize); + assert((offset + num) <= bufdata.size()); if (num > 0) { - memcpy(dst, &bufdata[offset], num); + std::copy_n(&bufdata[offset], num, dst); offset += num; } } //! Get the underlying vector storing the data of the deserialize //! buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } //! 
Get a pointer to the underlying data of the deserialize buffer - void* linearData() { return &bufdata[0]; } - - //! Get a pointer to the remaining data of the deserialize buffer - //! (as determined by offset) - const uint8_t* r_linearData() const { return &bufdata[offset]; } - //! Get the remaining size of the deserialize buffer (as determined - //! by offset) - size_t r_size() const { return bufdata.size() - offset; } - - //! Checks if the current location in the deserialize buffer is aligned - //! to some size a - bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; } - - //! Utility print of deserialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{(" << offset << ") " << std::hex; - for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii) - o << (unsigned int)*ii << " "; - o << std::dec << "}>"; - } + void* linearData() { return &bufdata[offset]; } - //! Operator for printing deserialize buffer - friend std::ostream& operator<<(std::ostream& os, - const DeSerializeBuffer& buf) { - buf.print(os); - return os; - } + const uint8_t* data() const { return &bufdata[offset]; } + uint8_t* data() { return &bufdata[offset]; } }; namespace internal { @@ -307,6 +278,19 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +//! Size of BufferWrapper is size + number of things in it +template +inline size_t gSizedObj(const galois::BufferWrapper& data) { + return sizeof(size_t) + data.size() * sizeof(T); +} + +template +inline size_t gSizedObj(const std::unordered_map& data) { + size_t sz = 0; + for (auto i : data) + sz += gSizedObj(i.first) + gSizedObj(i.second); + return sz; +} /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. @@ -400,7 +384,7 @@ inline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); } * * @returns size of the deserialize buffer passed into it */ -inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); } +inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.size(); } /** * Returns the size of the passed in insert bag. @@ -472,6 +456,15 @@ inline void gSerializeObj( * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ +template +inline void gSerializeObj(SerializeBuffer& buf, + const std::unordered_map& data) { + gSerialize(buf, data.size()); + for (auto i : data) { + gSerialize(buf, i.first, i.second); + } +} + template inline void gSerializeObj(SerializeBuffer& buf, const T& data, @@ -563,6 +556,11 @@ template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data); +// Forward declaration of buff serialize +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data); + /** * Serialize a sequence type into a buffer. * @@ -610,6 +608,18 @@ inline void gSerializeObj(SerializeBuffer& buf, gSerializeSeq(buf, data); } +//! Serialize BufferWrapper similarly to vector +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data) { + if (is_memory_copyable::value) { + gSerializeLinearSeq(buf, data); + } else { + GALOIS_DIE("have not implemented support for serializing nonPOD buffer " + "wrapper"); + } +} + /** * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. 
@@ -654,7 +664,7 @@ inline void gSerializeObj(SerializeBuffer& buf, * @param [in] data serialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { - buf.insert(data.linearData(), data.size()); + buf.insert(data.data(), data.size()); } /** @@ -665,7 +675,7 @@ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { */ inline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) { // buf.reserve(rbuf.r_size()); - buf.insert(rbuf.r_linearData(), rbuf.r_size()); + buf.insert(rbuf.data(), rbuf.size()); } /** @@ -729,8 +739,10 @@ gSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) { "Not POD Sequence"); typename Seq::size_type size = num; internal::gSerializeObj(buf, size); - size_t tsize = sizeof(typename Seq::value_type); - return LazyRef{buf.encomber(tsize * num)}; + size_t tsize = sizeof(typename Seq::value_type); + size_t cur_size = buf.size(); + buf.resize(cur_size + (tsize * num)); + return LazyRef{cur_size}; } /** @@ -800,6 +812,19 @@ void gDeserializeObj( data.deserialize(buf); } +template +void gDeserializeObj(DeSerializeBuffer& buf, std::unordered_map& data) { + uint64_t elts; + gDeserializeObj(buf, elts); + for (uint64_t i = 0; i < elts; i++) { + std::pair elt; + gDeserialize(buf, elt.first, elt.second); + if (buf.size() <= 0) { + break; + } + data[elt.first] = elt.second; + } +} /** * Deserialize a pair from a buffer. * @@ -921,6 +946,10 @@ gDeserializeObj(DeSerializeBuffer& buf, template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data); +// Forward declaration of buff wrapper deserialize +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& data); + /** * Deserialize into a sequence object * @@ -948,18 +977,10 @@ void gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) { template void gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) { typedef typename Seq::value_type T; - // seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); - // If the alignment is right, cast to a T array and insert - if (buf.atAlignment(alignof(T))) { - T* src = (T*)buf.r_linearData(); - seq.assign(src, &src[size]); - buf.setOffset(buf.getOffset() + size * sizeof(T)); - } else { - seq.resize(size); - buf.extract((uint8_t*)seq.data(), size * sizeof(T)); - } + seq.resize(size); + buf.extract((uint8_t*)seq.data(), size * sizeof(T)); } /** @@ -988,6 +1009,20 @@ void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data) { gDeserializeSeq(buf, data); } +//! 
deserialize into buf wrapper +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { + if (is_memory_copyable::value) { + // manual deserialization here + size_t buffer_size{0}; + gDeserializeObj(buf, buffer_size); + bf.resize(buffer_size); + buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); + } else { + GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); + } +} + /** * Deserialize into a PODResizeableArray * @@ -1051,9 +1086,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libdist/src/Barrier.cpp b/libdist/src/Barrier.cpp index 455e22aaed..0558d8ebb4 100644 --- a/libdist/src/Barrier.cpp +++ b/libdist/src/Barrier.cpp @@ -41,52 +41,6 @@ #include "galois/runtime/BareMPI.h" namespace { -class HostFence : public galois::substrate::Barrier { -public: - virtual const char* name() const { return "HostFence"; } - - virtual void reinit(unsigned) {} - - //! control-flow barrier across distributed hosts - //! acts as a distributed-memory fence as well (flushes send and receives) - virtual void wait() { - auto& net = galois::runtime::getSystemNetworkInterface(); - - if (galois::runtime::evilPhase == 0) { - galois::gWarn("evilPhase is 0, implying loop-around or no use: fence " - "may not work correctly!"); - } - - for (unsigned h = 0; h < net.Num; ++h) { - if (h == net.ID) - continue; - galois::runtime::SendBuffer b; - galois::runtime::gSerialize(b, net.ID + 1); // non-zero message - net.sendTagged(h, galois::runtime::evilPhase, b); - } - net.flush(); // flush all sends - - unsigned received = 1; // self - while (received < net.Num) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - do { - net.handleReceives(); // flush all receives from net.sendMsg() or - // net.sendSimple() - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); - } while (!p); - assert(p->first != net.ID); - // ignore received data - ++received; - } - ++galois::runtime::evilPhase; - if (galois::runtime::evilPhase >= - static_cast( - std::numeric_limits::max())) { // limit defined by MPI or - // LCI - galois::runtime::evilPhase = 1; - } - } -}; class HostBarrier : public galois::substrate::Barrier { public: @@ -110,8 +64,3 @@ galois::substrate::Barrier& galois::runtime::getHostBarrier() { static HostBarrier b; return b; } - -galois::substrate::Barrier& galois::runtime::getHostFence() { - static HostFence b; - return b; -} diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index 8faf4cee5a..1fe46bc514 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -105,8 +105,8 @@ void DistStatManager::combineAtHost_0_helper(void) { SendBuffer b; gSerialize(b, hTotalMap.region(i), hTotalMap.category(i), hTotalMap.stat(i).totalTy()); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -126,8 +126,8 @@ void DistStatManager::combineAtHost_0_helper(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, 
galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -151,8 +151,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -172,8 +172,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -182,10 +182,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { RecvBuffer& b = p->second; @@ -203,10 +203,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -230,10 +230,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -255,10 +255,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -286,13 +286,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper(); getSystemNetworkInterface().flush(); + // work done before check + td += 1; + // barrier while (td.reduce()) { + td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); @@ -302,13 +307,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper2(); getSystemNetworkInterface().flush(); + td += 1; + // barrier while (td.reduce()) { + td.reset(); + if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); diff --git a/libdist/src/Network.cpp b/libdist/src/Network.cpp index 44a189f7ad..7bf499a00b 100644 --- a/libdist/src/Network.cpp +++ b/libdist/src/Network.cpp @@ -93,7 +93,7 @@ 
void NetworkInterface::sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf) { gSerialize(buf, recv); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), @@ -104,30 +104,14 @@ void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), if (x != ID) { SendBuffer b; gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad); - sendTagged(x, 0, b); + sendTagged(x, 0, std::move(b)); } else if (self) { - RecvBuffer rb(buf.begin(), buf.end()); + RecvBuffer rb = RecvBuffer(std::move(buf.get())); recv(ID, rb); } } } -void NetworkInterface::handleReceives() { - std::unique_lock lg; - auto opt = recieveTagged(0, &lg); - while (opt) { - uint32_t src = std::get<0>(*opt); - RecvBuffer& buf = std::get<1>(*opt); - uintptr_t fp = 0; - gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp); - buf.pop_back(sizeof(uintptr_t)); - assert(fp); - auto f = (void (*)(uint32_t, RecvBuffer&))fp; - f(src, buf); - opt = recieveTagged(0, &lg); - } -} - NetworkInterface& galois::runtime::getSystemNetworkInterface() { #ifndef GALOIS_USE_LCI return makeNetworkBuffered(); diff --git a/libdist/src/NetworkBuffered.cpp b/libdist/src/NetworkBuffered.cpp index 7b6d6c6ce1..a58f16c3ab 100644 --- a/libdist/src/NetworkBuffered.cpp +++ b/libdist/src/NetworkBuffered.cpp @@ -67,6 +67,12 @@ class NetworkInterfaceBuffered : public NetworkInterface { // using vTy = std::vector; using vTy = galois::PODResizeableArray; + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + static constexpr uint8_t kMaxSegmentTag = std::numeric_limits::max(); + static constexpr size_t kMaxBufferSize = + static_cast(std::numeric_limits::max()); + static constexpr size_t kMaxDataSize = kMaxBufferSize - kHeaderSize; + /** * Receive buffers for the buffered network interface */ @@ -77,6 +83,38 @@ class NetworkInterfaceBuffered : public NetworkInterface { // tag of head of queue std::atomic dataPresent; + struct PartialMessages { + uint8_t num_segments{0}; + std::vector segments; + }; + std::unordered_map partial_messages_map_; + + std::optional CombinePartialMessages(const BufferHeader& header, + vTy&& vec) { + auto& partial_messages = partial_messages_map_[header.segment_tag]; + if (partial_messages.num_segments == 0) { + partial_messages.segments.resize(header.num_segments); + } + + partial_messages.segments[header.segment_id] = std::move(vec); + ++partial_messages.num_segments; + + if (partial_messages.num_segments != header.num_segments) { + assert(partial_messages.num_segments < header.num_segments); + assert(partial_messages.segments.size() == header.num_segments); + return std::nullopt; + } + + std::vector& segments = partial_messages.segments; + vTy message = std::move(segments[0]); + for (size_t i = 1, end = segments.size(); i < end; ++i) { + message.insert(message.end(), segments[i].begin() + kHeaderSize, + segments[i].end()); + } + partial_messages_map_.erase(header.segment_tag); + return std::make_optional(std::move(message)); + } + bool sizeAtLeast(size_t n, uint32_t tag) { size_t tot = -frontOffset; for (auto& v : data) { @@ -163,30 +201,6 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::optional popMsg(uint32_t tag, std::atomic& inflightRecvs) { std::lock_guard lg(qlock); -#ifndef NO_AGG - uint32_t len = getLenFromFront(tag); - // assert(len); - if (len == ~0U || len == 0) - return std::optional(); - if (!sizeAtLeast(sizeof(uint32_t) + len, tag)) - return std::optional(); - erase(4, 
inflightRecvs); - - // Try just using the buffer - if (auto r = popVec(len, inflightRecvs)) { - auto start = r->size() - len; - // std::cerr << "FP " << r->size() << " " << len << " " << start - // << "\n"; - return std::optional(RecvBuffer(std::move(*r), start)); - } - - RecvBuffer buf(len); - // FIXME: This is slows things down 25% - copyOut((char*)buf.linearData(), len); - erase(len, inflightRecvs); - // std::cerr << "p " << tag << " " << len << "\n"; - return std::optional(std::move(buf)); -#else if (data.empty() || data.front().tag != tag) return std::optional(); @@ -201,31 +215,28 @@ class NetworkInterfaceBuffered : public NetworkInterface { } return std::optional(RecvBuffer(std::move(vec), 0)); -#endif } // Worker thread interface - void add(NetworkIO::message m) { + bool add(NetworkIO::message m) { + BufferHeader* header = reinterpret_cast(m.data.data()); + if (header->type == BufferHeader::BufferType::kPartialMessage) { + std::optional segment = + CombinePartialMessages(*header, std::move(m.data)); + if (!segment) { + return false; + } + + m.data = std::move(*segment); + } std::lock_guard lg(qlock); if (data.empty()) { galois::runtime::trace("ADD LATEST ", m.tag); dataPresent = m.tag; } - // std::cerr << m.data.size() << " " << - // std::count(m.data.begin(), m.data.end(), 0) << "\n"; - // for (auto x : m.data) { - // std::cerr << (int) x << " "; - // } - // std::cerr << "\n"; - // std::cerr << "A " << m.host << " " << m.tag << " " << m.data.size() << - // "\n"; - data.push_back(std::move(m)); - - assert(data.back().data.size() != - (unsigned int)std::count(data.back().data.begin(), - data.back().data.end(), 0)); + return true; } bool hasData(uint32_t tag) { return dataPresent == tag; } @@ -245,7 +256,7 @@ class NetworkInterfaceBuffered : public NetworkInterface { struct msg { uint32_t tag; vTy data; - msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {} + msg(uint32_t t, vTy&& _data) : tag(t), data(std::move(_data)) {} }; std::deque messages; @@ -254,6 +265,43 @@ class NetworkInterfaceBuffered : public NetworkInterface { //! @todo FIXME track time since some epoch in an atomic. 
std::chrono::high_resolution_clock::time_point time; SimpleLock lock, timelock; + uint8_t segment_tag_{0}; + + void IncrementSegmentTag() { + if (segment_tag_ == kMaxSegmentTag) { + segment_tag_ = 0; + } else { + ++segment_tag_; + } + } + + std::vector Split(uint32_t host, uint32_t tag, + vTy&& vec) { + std::vector segments; + segments.emplace_back(std::move(vec)); + auto begin = segments[0].begin(); + for (size_t i = kMaxBufferSize, end = segments[0].size(); i < end; + i += kMaxDataSize) { + vTy segment(kHeaderSize); + size_t segment_end = std::min(end, i + kMaxDataSize); + segment.insert(segment.end(), begin + i, begin + segment_end); + segments.emplace_back(std::move(segment)); + } + segments[0].resize(kMaxBufferSize); + + std::vector msg; + for (size_t i = 0; i < segments.size(); ++i) { + auto& segment = segments[i]; + BufferHeader* header = reinterpret_cast(segment.data()); + header->type = BufferHeader::BufferType::kPartialMessage; + header->num_segments = segments.size(); + header->segment_id = i; + header->segment_tag = segment_tag_; + msg.emplace_back(host, tag, std::move(segment)); + } + IncrementSegmentTag(); + return msg; + } public: unsigned long statSendTimeout; @@ -269,103 +317,35 @@ class NetworkInterfaceBuffered : public NetworkInterface { } } - bool ready() { -#ifndef NO_AGG - if (numBytes == 0) - return false; - if (urgent) { - ++statSendUrgent; - return true; - } - if (numBytes > COMM_MIN) { - ++statSendOverflow; - return true; - } - auto n = std::chrono::high_resolution_clock::now(); - decltype(n) mytime; - { - std::lock_guard lg(timelock); - mytime = time; - } - auto elapsed = - std::chrono::duration_cast(n - mytime); - if (elapsed.count() > COMM_DELAY) { - ++statSendTimeout; - return true; - } - return false; -#else - return messages.size() > 0; -#endif - } + bool ready() { return messages.size() > 0; } - std::pair - assemble(std::atomic& GALOIS_UNUSED(inflightSends)) { + std::vector assemble(uint32_t host) { std::unique_lock lg(lock); - if (messages.empty()) - return std::make_pair(~0, vTy()); -#ifndef NO_AGG - // compute message size - uint32_t len = 0; - int num = 0; - uint32_t tag = messages.front().tag; - for (auto& m : messages) { - if (m.tag != tag) { - break; - } else { - // do not let it go over the integer limit because MPI_Isend cannot - // deal with it - if ((m.data.size() + sizeof(uint32_t) + len + num) > - static_cast(std::numeric_limits::max())) { - break; - } - len += m.data.size(); - num += sizeof(uint32_t); - } - } - lg.unlock(); - // construct message - vTy vec; - vec.reserve(len + num); - // go out of our way to avoid locking out senders when making messages - lg.lock(); - do { - auto& m = messages.front(); - lg.unlock(); - union { - uint32_t a; - uint8_t b[sizeof(uint32_t)]; - } foo; - foo.a = m.data.size(); - vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]); - vec.insert(vec.end(), m.data.begin(), m.data.end()); - if (urgent) - --urgent; - lg.lock(); - messages.pop_front(); - --inflightSends; - } while (vec.size() < len + num); - ++inflightSends; - numBytes -= len; -#else + assert(!messages.empty()); uint32_t tag = messages.front().tag; vTy vec(std::move(messages.front().data)); messages.pop_front(); -#endif - return std::make_pair(tag, std::move(vec)); + + if (vec.size() > kMaxBufferSize) { + return Split(host, tag, std::move(vec)); + } + + BufferHeader* header = reinterpret_cast(vec.data()); + header->type = BufferHeader::BufferType::kSingleMessage; + std::vector msgs; + msgs.emplace_back(host, tag, std::move(vec)); + return msgs; } 
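// assemble() now emits exactly one network message per queued send buffer:
// buffers no larger than kMaxBufferSize leave as a single kSingleMessage,
// while larger ones are cut by Split() into segments holding at most
// kMaxDataSize payload bytes behind a BufferHeader tagged
// {kPartialMessage, num_segments, segment_id, segment_tag}. On the receive
// side, add() feeds partial segments to CombinePartialMessages(), which
// groups them by segment_tag and only enqueues the reassembled buffer for
// popMsg() once all num_segments have arrived.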
-    void add(uint32_t tag, vTy& b) {
+    void add(uint32_t tag, vTy&& b) {
       std::lock_guard<SimpleLock> lg(lock);
       if (messages.empty()) {
         std::lock_guard<SimpleLock> lg(timelock);
         time = std::chrono::high_resolution_clock::now();
       }
-      unsigned oldNumBytes = numBytes;
+      assert(b.size() >= kHeaderSize);
       numBytes += b.size();
-      galois::runtime::trace("BufferedAdd", oldNumBytes, numBytes, tag,
-                             galois::runtime::printVec(b));
-      messages.emplace_back(tag, b);
+      messages.emplace_back(tag, std::move(b));
     }
   }; // end send buffer class
@@ -402,24 +382,26 @@ class NetworkInterfaceBuffered : public NetworkInterface {
       // handle send queue i
       auto& sd = sendData[i];
       if (sd.ready()) {
-        NetworkIO::message msg;
-        msg.host = i;
-        std::tie(msg.tag, msg.data) = sd.assemble(inflightSends);
-        galois::runtime::trace("BufferedSending", msg.host, msg.tag,
-                               galois::runtime::printVec(msg.data));
-        ++statSendEnqueued;
-        netio->enqueue(std::move(msg));
+        std::vector<NetworkIO::message> msgs = sd.assemble(i);
+        if (msgs.size() > 1) {
+          inflightSends += msgs.size() - 1;
+        }
+
+        for (auto& msg : msgs) {
+          ++statSendEnqueued;
+          netio->enqueue(std::move(msg));
+        }
       }
+
       // handle receive
       NetworkIO::message rdata = netio->dequeue();
       if (rdata.data.size()) {
         ++statRecvDequeued;
-        assert(rdata.data.size() !=
-               (unsigned int)std::count(rdata.data.begin(), rdata.data.end(),
-                                        0));
-        galois::runtime::trace("BufferedRecieving", rdata.host, rdata.tag,
-                               galois::runtime::printVec(rdata.data));
-        recvData[rdata.host].add(std::move(rdata));
+        uint32_t h = rdata.host;
+        bool not_partial_segment = recvData[h].add(std::move(rdata));
+        if (!not_partial_segment) {
+          --inflightRecvs;
+        }
       }
     }
   }
@@ -454,22 +436,19 @@ class NetworkInterfaceBuffered : public NetworkInterface {
 
   std::unique_ptr<NetworkIO> netio;
 
-  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf,
+  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf,
                           int phase) {
-    ++inflightSends;
     tag += phase;
     statSendNum += 1;
-    statSendBytes += buf.size();
-    galois::runtime::trace("sendTagged", dest, tag,
-                           galois::runtime::printVec(buf.getVec()));
+    statSendBytes += buf.size() + kHeaderSize;
+    memUsageTracker.incrementMemUsage(buf.size() + kHeaderSize);
+    ++inflightSends;
     auto& sd = sendData[dest];
-    sd.add(tag, buf.getVec());
+    sd.add(tag, std::move(buf.get()));
   }
 
   virtual std::optional<std::pair<uint32_t, RecvBuffer>>
-  recieveTagged(uint32_t tag,
-                std::unique_lock<galois::substrate::SimpleLock>* rlg,
-                int phase) {
+  recieveTagged(uint32_t tag, int phase) {
     tag += phase;
     for (unsigned h = 0; h < recvData.size(); ++h) {
       auto& rq = recvData[h];
@@ -480,12 +459,8 @@ class NetworkInterfaceBuffered : public NetworkInterface {
       auto buf = rq.popMsg(tag, inflightRecvs);
       if (buf) {
         ++statRecvNum;
-        statRecvBytes += buf->size();
-        memUsageTracker.decrementMemUsage(buf->size());
-        if (rlg)
-          *rlg = std::move(lg);
-        galois::runtime::trace("recvTagged", h, tag,
-                               galois::runtime::printVec(buf->getVec()));
+        statRecvBytes += buf->size() + kHeaderSize;
+        memUsageTracker.decrementMemUsage(buf->size() + kHeaderSize);
         anyReceivedMessages = true;
         return std::optional<std::pair<uint32_t, RecvBuffer>>(
             std::make_pair(h, std::move(*buf)));
diff --git a/libdist/src/NetworkLCI.cpp b/libdist/src/NetworkLCI.cpp
index 59b17a1d35..3770356c8c 100644
--- a/libdist/src/NetworkLCI.cpp
+++ b/libdist/src/NetworkLCI.cpp
@@ -182,8 +182,8 @@ class NetworkInterfaceLCI : public NetworkInterface {
     statSendBytes += buf.size();
     // int count = 0;
 #ifndef GALOIS_SUPPORT_ASYNC
-    if (buf.getVec().size() < 8192) {
-      while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag,
+    if (buf.get().size() < 8192) {
+      while (lc_sendm(buf.get().data(), buf.get().size(), dest, tag,
                       lc_p2p_ep[phase]) != LC_OK) {
         sched_yield();
       }
@@ -191,7 +191,7 @@ class NetworkInterfaceLCI : public NetworkInterface {
 #endif
     {
       pendingReq* msg =
-          new pendingReq(dest, tag, phase, buf.getVec(), inflightSends);
+          new pendingReq(dest, tag, phase, buf.get(), inflightSends);
       while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag,
                       lc_p2p_ep[phase], free_req, msg) != LC_OK) {
         sched_yield();
diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt
index 8e9d56d48e..4721bc0261 100644
--- a/libgalois/CMakeLists.txt
+++ b/libgalois/CMakeLists.txt
@@ -10,7 +10,7 @@ set(sources
   "${CMAKE_CURRENT_BINARY_DIR}/Version.cpp"
   src/Barrier_Counting.cpp
   src/Barrier.cpp
-  src/Barrier_Dissemination.cpp 
+  src/Barrier_Dissemination.cpp
   src/Barrier_MCS.cpp
   src/Barrier_Pthread.cpp
   src/Barrier_Simple.cpp
@@ -86,6 +86,7 @@ endif()
 
 target_link_libraries(galois_shmem INTERFACE pygalois)
 target_link_libraries(galois_shmem PRIVATE Threads::Threads)
+target_link_libraries(galois_shmem PUBLIC galois_support)
 
 if (CMAKE_HAVE_PTHREAD_H)
   target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD)
diff --git a/libgalois/include/galois/Atomic.h b/libgalois/include/galois/Atomic.h
deleted file mode 100644
index e073bf5aa7..0000000000
--- a/libgalois/include/galois/Atomic.h
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * This file belongs to the Galois project, a C++ library for exploiting
- * parallelism. The code is being released under the terms of the 3-Clause BSD
- * License (a copy is located in LICENSE.txt at the top-level directory).
- *
- * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
- * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
- * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
- * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
- * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
- * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
- * shall University be liable for incidental, special, indirect, direct or
- * consequential damages or loss of profits, interruption of business, or
- * related expenses which may arise from use of Software or Documentation,
- * including but not limited to those resulting from defects in Software and/or
- * Documentation, or loss or inaccuracy of data of any kind.
- */
-
-#ifndef GALOIS_ATOMIC_H
-#define GALOIS_ATOMIC_H
-
-#include <iterator>
-
-#include "galois/config.h"
-#include "galois/substrate/CacheLineStorage.h"
-
-namespace galois {
-
-namespace internal {
-/**
- * Common implementation.
- */
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicImpl {
-  // galois::runtime::LL::CacheLineStorage<T> val;
-  W<T> val;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicImpl(const T& i) : val(i) {}
-  //! default constructor
-  GAtomicImpl() {}
-
-  //! atomic add and fetch
-  T operator+=(const T& rhs) { return __sync_add_and_fetch(&val.data, rhs); }
-  //! atomic sub and fetch
-  T operator-=(const T& rhs) { return __sync_sub_and_fetch(&(val.data), rhs); }
-  //! atomic increment and fetch
-  T operator++() { return __sync_add_and_fetch(&(val.data), 1); }
-  //! atomic fetch and increment
-  T operator++(int) { return __sync_fetch_and_add(&(val.data), 1); }
-  //! atomic decrement and fetch
-  T operator--() { return __sync_sub_and_fetch(&(val.data), 1); }
-  //! atomic fetch and decrement
-  T operator--(int) { return __sync_fetch_and_sub(&(val.data), 1); }
-  //! conversion operator to base data type
-  operator T() const { return val.data; }
-  //! assign from underlying type
-  T& operator=(const T& i) { return val.data = i; }
-  //! assignment operator
-  T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; }
-  //! direct compare and swap
-  bool cas(const T& expected, const T& updated) {
-    if (val.data != expected) {
-      return false;
-    }
-#if defined(__INTEL_COMPILER)
-    return __sync_bool_compare_and_swap(
-        &val.data, *reinterpret_cast<const ptrdiff_t*>(&expected),
-        *reinterpret_cast<const ptrdiff_t*>(&updated));
-#else
-    return __sync_bool_compare_and_swap(&val.data, expected, updated);
-#endif
-  }
-};
-
-// non-current version
-template <typename T, template <typename _> class W>
-class GAtomicImpl<T, W, false> {
-  // galois::runtime::LL::CacheLineStorage<T> val;
-  W<T> val;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicImpl(const T& i) : val(i) {}
-  //! default constructor
-  GAtomicImpl() {}
-
-  //! atomic add and fetch
-  T operator+=(const T& rhs) { return (val.data += rhs); }
-  //! atomic sub and fetch
-  T operator-=(const T& rhs) { return (val.data -= rhs); }
-  //! atomic increment and fetch
-  T operator++() { return ++(val.data); }
-  //! atomic fetch and increment
-  T operator++(int) { return (val.data)++; }
-  //! atomic decrement and fetch
-  T operator--() { return --(val.data); }
-  //! atomic fetch and decrement
-  T operator--(int) { return (val.data)--; }
-  //! conversion operator to base data type
-  operator T() const { return val.data; }
-  //! assign from underlying type
-  T& operator=(const T& i) { return val.data = i; }
-  //! assignment operator
-  T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; }
-  //! direct compare and swap
-  bool cas(const T& expected, const T& updated) {
-    if (val.data != expected) {
-      return false;
-    } else {
-      val.data = updated;
-      return true;
-    }
-  }
-};
-
-//! Basic atomic
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase : public GAtomicImpl<T, W, CONCURRENT> {
-  typedef GAtomicImpl<T, W, CONCURRENT> Super_ty;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicBase(const T& i) : Super_ty(i) {}
-
-  //! default constructor
-  GAtomicBase() : Super_ty() {}
-
-  T& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); }
-
-  T& operator=(const T& that) { return Super_ty::operator=(that); }
-};
-
-//! Specialization for pointers
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase<T*, W, CONCURRENT> : public GAtomicImpl<T*, W, CONCURRENT> {
-  typedef GAtomicImpl<T*, W, CONCURRENT> Super_ty;
-
-public:
-  typedef typename std::iterator_traits<T*>::difference_type difference_type;
-
-  GAtomicBase() : Super_ty() {}
-
-  GAtomicBase(T* i) : Super_ty(i) {}
-
-  T*& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); }
-
-  T*& operator=(T* that) { return Super_ty::operator=(that); }
-
-  T* operator+=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_add_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data += rhs);
-    }
-  }
-
-  T* operator-=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_sub_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data -= rhs);
-    }
-  }
-};
-
-//! Specialization for const pointers
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase<const T*, W, CONCURRENT>
-    : public GAtomicImpl<const T*, W, CONCURRENT> {
-  typedef GAtomicImpl<const T*, W, CONCURRENT> Super_ty;
-
-public:
-  typedef
-      typename std::iterator_traits<const T*>::difference_type difference_type;
-
-  GAtomicBase() : Super_ty() {}
-
-  GAtomicBase(const T* i) : Super_ty(i) {}
-
-  const T*& operator=(const GAtomicBase& that) {
-    return Super_ty::operator=(that);
-  }
-
-  const T*& operator=(const T* that) { return Super_ty::operator=(that); }
-
-  const T* operator+=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_add_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data += rhs);
-    }
-  }
-
-  const T* operator-=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_sub_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data -= rhs);
-    }
-  }
-};
-
-//! Specialization for bools
-template