diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..71b4e67e5e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.csv filter=lfs diff=lfs merge=lfs -text +inputs/wmd/data.001.csv filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 0000000000..925406e22c --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,162 @@ +name: Docker / Ubuntu 22.04 / Build and Test +run-name: docker-ubuntu-2204 performed by @${{ github.triggering_actor }} + +on: + pull_request: + branches: + - master + - main + push: + branches: + - master + - main + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + docker-create-ubuntu-2204: + name: create + runs-on: self-hosted + permissions: + contents: read + packages: write + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Create Docker Image + timeout-minutes: 45 + run: | + make ci-image + + ubuntu-2204-docker: + name: gcc / ${{ matrix.build-type }} / ${{ matrix.sanitizer-type }} + runs-on: self-hosted + permissions: + contents: read + packages: write + env: + IMAGE_NAME: galois + CONTAINER_SRC_DIR: "/pando-galois" + CONTAINER_BUILD_DIR: "/pando-galois/build" + CONTAINER_WORK_DIR: "/pando-galois" + GALOIS_CONTAINER_ENV: "-e=GALOIS_BUILD_TOOL=Ninja" + INTERACTIVE: "" + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + build-type: ['Release'] + sanitizer-type: ['nosan', 'san'] + needs: docker-create-ubuntu-2204 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: 'true' + submodules: recursive + + - name: Set up environment variables + timeout-minutes: 1 + run: | + echo "UNAME=$(whoami)" >> $GITHUB_ENV + echo "UID=$(id -u)" >> $GITHUB_ENV + echo "GID=$(id -g)" >> $GITHUB_ENV + echo "SRC_DIR=$(pwd)" >> $GITHUB_ENV + echo "GALOIS_CCACHE_DIR=/var/local/$(whoami)/.ccache" >> $GITHUB_ENV + echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile)" >> $GITHUB_ENV + if [ ${{ matrix.sanitizer-type }} == 'san' ]; then + echo "GALOIS_CONTAINER_ENV=$GALOIS_CONTAINER_ENV -e=GALOIS_EXTRA_CXX_FLAGS='\"-fsanitize=address -fsanitize=undefined\"'" >> $GITHUB_ENV + fi + if [ ${{ matrix.build-type }} == 'Debug' ]; then + echo "GALOIS_CONTAINER_ENV=$GALOIS_CONTAINER_ENV -e=GALOIS_EXTRA_CXX_FLAGS='-O3'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'zerberus-0' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'zerberus-1' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-0' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=0,1,2,3,4,5,6,7,32,33,34,35,36,37,38,39'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-1' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=16,17,18,19,20,21,22,23,48,49,50,51,52,53,54,55'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-2' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=8,9,10,11,12,13,14,15,40,41,42,43,44,45,46,47'" >> $GITHUB_ENV + fi + if [ ${{ runner.name }} == 'languedoc-3' ]; then + echo "CONTAINER_CPUSET='--cpuset-cpus=24,25,26,27,28,29,30,31,56,57,58,59,60,61,62,63'" >> $GITHUB_ENV + fi + cat $GITHUB_ENV + + - name: Configure + timeout-minutes: 10 + run: | + 
mkdir -p ${{ env.GALOIS_CCACHE_DIR }} -m=777 + CONTAINER_CMD="bash -lc 'source /opt/intel/oneapi/setvars.sh && make setup-ci'" \ + CONTAINER_OPTS="-e=BUILD_TYPE=${{ matrix.build-type }}" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker + + - name: Build + timeout-minutes: 15 + run: | + CONTAINER_CMD="bash -c 'ninja -j10 || ninja || ninja'" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + CONTAINER_WORKDIR="${{ env.CONTAINER_BUILD_DIR }}" \ + make docker + + - name: Run Tests + timeout-minutes: 5 + run: | + CONTAINER_CMD="make run-tests" \ + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker + + docker-pre-commit-ubuntu-2204: + name: pre-commit + runs-on: self-hosted + permissions: + contents: read + packages: write + env: + IMAGE_NAME: galois + CONTAINER_SRC_DIR: "/pando-galois" + CONTAINER_BUILD_DIR: "/pando-galois/build" + CONTAINER_WORK_DIR: "/pando-galois" + INTERACTIVE: "" + defaults: + run: + shell: bash -l {0} + needs: docker-create-ubuntu-2204 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up environment variables + timeout-minutes: 1 + run: | + echo "SRC_DIR=$(pwd)" >> $GITHUB_ENV + echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile)" >> $GITHUB_ENV + cat $GITHUB_ENV + - name: Check pre-commit + timeout-minutes: 10 + run: | + IMAGE_NAME="${{ env.IMAGE_NAME }}" \ + VERSION="${{ env.IMAGE_VERSION }}" \ + make docker-pre-commit diff --git a/.gitignore b/.gitignore index 94fc673c6e..a1238adb3e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,12 +21,15 @@ cscope.out .tags* tags .ycm_extra_conf.py +.ccache # no build files -/build* +/*build* +/dockerbuild* # no python build artifacts *.pyc /python/galois.egg-info /python/galois/*.so /_skbuild + diff --git a/.gitmodules b/.gitmodules index 0095886558..d66cce84ad 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "moderngpu"] - path = external/moderngpu - url = https://github.com/moderngpu/moderngpu.git -[submodule "cub"] - path = external/cub - url = https://github.com/NVlabs/cub.git +[submodule "external/pcg-cpp"] + path = external/pcg-cpp + url = https://github.com/imneme/pcg-cpp.git +[submodule "external/parallel-hashmap"] + path = external/parallel-hashmap + url = https://github.com/greg7mdp/parallel-hashmap.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..c30b4276e2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +--- +files: ^libcusp|^libdeepgalois|^libdist|^libgalois|^libgluon|^libgnn|^libwmd +exclude: ^scripts|^python|^inputs +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.2.0 + hooks: + - id: forbid-tabs + exclude: ^scripts|^python + - id: remove-tabs + exclude: ^scripts|^python + args: [--whitespaces-count, '2'] + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-format + args: [-i] + # - id: clang-tidy + # args: [--fix, -p=build/compile_commands.json] diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000000..c00efa2d48 --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +pre-commit 2.19.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index b278ea0df3..721a4db6e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ 
-cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.17) project(Galois) @@ -6,6 +6,13 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) +if(STACK_CAPTURE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}") +endif(STACK_CAPTURE) + file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" GALOIS_VERSION_MAJOR ${GALOIS_VERSION}) @@ -22,9 +29,10 @@ endif() ###### Options (alternatively pass as options to cmake -DName=Value) ###### ###### Distributed-heterogeneous features ###### -set(GALOIS_ENABLE_DIST OFF CACHE BOOL "Enable distributed features") +set(GALOIS_ENABLE_DIST ON CACHE BOOL "Enable distributed features") set(GALOIS_CUDA_CAPABILITY "" CACHE STRING "Semi-colon list of CUDA compute capability version numbers to enable GPU features") # e.g., "3.7;6.1" set(GALOIS_COMM_STATS OFF CACHE BOOL "Report more detailed statistics of communication") +set(GALOIS_ENABLE_WMD ON CACHE BOOL "Enable WMD dataset support") ###### General features ###### set(GALOIS_ENABLE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(GALOIS_ENABLE_VTUNE OFF CACHE BOOL "Use VTune for profiling") @@ -33,7 +41,12 @@ set(GALOIS_GRAPH_LOCATION "" CACHE PATH "Location of inputs for tests if downloa set(CXX_CLANG_TIDY "" CACHE STRING "Semi-colon list specifying clang-tidy command and arguments") set(CMAKE_CXX_COMPILER_LAUNCHER "" CACHE STRING "Semi-colon list specifying command to wrap compiler invocations (e.g., ccache)") set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture ('none' to disable)") -set(GALOIS_USE_SANITIZER "" CACHE STRING "Semi-colon list of sanitizers to use (Memory, MemoryWithOrigins, Address, Undefined, Thread)") + +set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") +set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") +# TODO; this is GNN related; find better way to do than hardcode +#SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) + # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. # Putting this here is mostly just a placeholder so people know it's an option. 
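The `STACK_CAPTURE` block above works through GCC's `-finstrument-functions`, which makes the compiler call two well-known hooks on entry to and exit from every function that is not on the exclude list; `libgalois/include/galois/runtime/StackTracer.h` presumably supplies the real implementations. A minimal sketch of such hooks, assuming only the documented GCC interface (the depth counter and logging here are illustrative, not the repository's tracer):

```cpp
// Illustration only: with -finstrument-functions, the compiler emits calls to
// these hooks around every non-excluded function.
#include <cstdio>

namespace {
thread_local int g_depth = 0; // hypothetical per-thread call depth
}

extern "C" {

__attribute__((no_instrument_function))
void __cyg_profile_func_enter(void* fn, void* call_site) {
  // indent by call depth, then report the callee and caller addresses
  std::fprintf(stderr, "%*senter %p (from %p)\n", 2 * g_depth++, "", fn, call_site);
}

__attribute__((no_instrument_function))
void __cyg_profile_func_exit(void* fn, void* call_site) {
  std::fprintf(stderr, "%*sexit  %p (from %p)\n", 2 * --g_depth, "", fn, call_site);
}

} // extern "C"
```

The `-finstrument-functions-exclude-file-list` entries added above keep `StackTracer.h` and `<sstream>` themselves uninstrumented, presumably so the hooks do not recurse into their own logging code.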
@@ -45,6 +58,7 @@ set(GALOIS_NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a sing set(GALOIS_USE_LCI OFF CACHE BOOL "Use LCI network runtime instead of MPI") set(GALOIS_USE_BARE_MPI OFF CACHE BOOL "Use MPI directly (no dedicated network-runtime thread)") set(GALOIS_NUM_TEST_THREADS "" CACHE STRING "Maximum number of threads to use when running tests (default: number of physical cores)") +set(GALOIS_ENABLE_INSTRUMENT OFF CACHE BOOL "Enable generating instrument in the runtime") if(NOT GALOIS_NUM_TEST_THREADS) cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES) @@ -59,6 +73,31 @@ include(CTest) ###### Configure compiler ###### +if(PROJECT_IS_TOP_LEVEL) + include_directories(${Galois_SOURCE_DIR}/external/parallel-hashmap) + + if(CMAKE_CXX_FLAGS) + message(STATUS "Provided CXX Flags: " ${CMAKE_CXX_FLAGS}) + endif() + + set(CMAKE_OPTIMIZE_DEPENDENCIES true) + + # Setup CCache + find_program(CCACHE_EXECUTABLE ccache) + if(CCACHE_EXECUTABLE) + message(STATUS "CCache found at: " ${CCACHE_EXECUTABLE}) + set(ccacheEnv + CCACHE_SLOPPINESS=pch_defines,time_macros + ) + # NOTE: Ccache 4.2+ required for reliable CUDA support + foreach(lang IN ITEMS C CXX OBJC OBJCXX CUDA) + set(CMAKE_${lang}_COMPILER_LAUNCHER + ${CMAKE_COMMAND} -E env ${ccacheEnv} ${CCACHE_EXECUTABLE} + ) + endforeach() + endif() +endif() + # generate compile_commands.json set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -130,6 +169,31 @@ endif() ###### Configure features ###### +################################################################################ +# For GNN matrix multiplies +# TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file +################################################################################ +if(USE_MKL_BLAS) + find_package(MKL CONFIG REQUIRED PATH $ENV{MKL_ROOT}) + if (MKL_FOUND) + else() + message(WARNING "MKL not found") + endif() +endif() + +#SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +if(USE_OPENBLAS) + find_package(OpenBLAS) + message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") + if (OPENBLAS_FOUND) + include_directories(${OPENBLAS_INCLUDE_DIRS}) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + +################################################################################ + if(GALOIS_ENABLE_VTUNE) set(VTune_ROOT /opt/intel/vtune_amplifier) find_package(VTune REQUIRED) @@ -137,6 +201,7 @@ if(GALOIS_ENABLE_VTUNE) add_definitions(-DGALOIS_ENABLE_VTUNE) endif() + if(GALOIS_ENABLE_PAPI) find_package(PAPI REQUIRED) include_directories(${PAPI_INCLUDE_DIRS}) @@ -191,6 +256,7 @@ endif() add_custom_target(lib) add_custom_target(apps) +add_subdirectory(external) # Core libraries (lib) add_subdirectory(libsupport) @@ -201,8 +267,19 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + if (GALOIS_ENABLE_WMD) + find_package(MPI REQUIRED) + add_subdirectory(libwmd) + endif() endif() + +# TODO(loc) prefix with GALOIS +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) +endif(USE_DEEPGALOIS) + string(COMPARE NOTEQUAL "${GALOIS_CUDA_CAPABILITY}" "" GALOIS_ENABLE_GPU) + if (GALOIS_ENABLE_GPU) enable_language(CUDA) foreach(GENCODE ${GALOIS_CUDA_CAPABILITY}) @@ -210,8 +287,42 @@ if (GALOIS_ENABLE_GPU) add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") endforeach() + # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included) + 
add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) + add_subdirectory(libgpu) + + if (USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_HOST_COMPILER g++) + + string(REPLACE "." "" GENCODES ${GALOIS_CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + + # MGPU v1.1 + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + cuda_include_directories("${MGPU_ROOT}/src") + + # CUB v1.6.4 + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + cuda_include_directories("${CUB_ROOT}") + + #find_package(OpenCL REQUIRED) + endif() endif() + +if (GALOIS_ENABLE_DIST AND USE_MKL_BLAS) + # here because I need the GPU declarations above + add_subdirectory(libgnn) +endif() + add_subdirectory(libpangolin) # Applications (apps) @@ -274,3 +385,11 @@ set(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH}) include(CPack) + +if(STACK_CAPTURE) + message("Writing CMAKE_CXX_FLAGS") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTACK_TRACE -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}" CACHE STRING "CMAKE Flags" FORCE) +endif(STACK_CAPTURE) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..007227dc70 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing + +Contributors must run quality checks on code. In place of CI we +recommend using `pre-commit` (described below) instead of running +tools like `clang-format` manually. + +Code should be clear and documented where needed. + +## Setup + +Users can run `make docker-image` to set up all dependencies needed for +`pando-galois`. After creating the image it can be run via `make docker`. +First-time CMake users can then run `make run-cmake`. + +## Tools + +### [asdf](https://asdf-vm.com) + +Provides a declarative set of tools pinned to +specific versions for environmental consistency. + +These tools are defined in `.tool-versions`. +Run `make dependencies` to initialize a new environment. + +### [pre-commit](https://pre-commit.com) + +A left-shifting tool that consistently runs a set of checks on the code repo. +Our checks enforce syntax validations and formatting. +We encourage contributors to use pre-commit hooks.
+ +```shell +# install all pre-commit hooks +make hooks + +# run pre-commit on repo once +make pre-commit +``` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..cceb15b94a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,88 @@ +ARG BUILD_IMAGE=ubuntu:22.04 +FROM --platform=linux/amd64 ${BUILD_IMAGE} AS dev + +WORKDIR /tmp + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && \ + apt install -y \ + cmake \ + gcc \ + g++ \ + ccache \ + build-essential \ + make \ + libboost-all-dev \ + libfmt-dev \ + libzstd-dev \ + lsb-release \ + wget \ + software-properties-common \ + gnupg \ + gdb \ + vim \ + git \ + python3 \ + python3-pip \ + unzip \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# setup intel repo for intel-basekit +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \ + gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \ + tee /etc/apt/sources.list.d/oneAPI.list +RUN apt update && \ + apt install -y \ + intel-basekit \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" + +ENV NINJA_BUILD_VERSION=1.11.1 +RUN wget https://github.com/ninja-build/ninja/releases/download/v${NINJA_BUILD_VERSION}/ninja-linux.zip -P /tmp && \ + unzip /tmp/ninja-linux.zip -d /usr/bin && \ + rm /tmp/ninja-linux.zip + +ARG IS_CI=true + +RUN if [ "${IS_CI}" != "true" ] ; then \ + apt update -y \ + && apt install -y \ + vim \ + gdb \ + universal-ctags \ + powerline \ + zsh \ + valgrind \ + sudo \ + doxygen \ + texlive-latex-extra \ + texlive-font-utils \ + && apt clean; fi + +ARG SRC_DIR=/pando-galois +ARG BUILD_DIR=/pando-galois/dockerbuild +ARG UNAME +ARG UID +ARG GID + +RUN if [ "${UNAME}" != "root" ] ; then groupadd -g ${GID} ${UNAME} \ + && useradd -ms /bin/bash -u "${UID}" -g "${GID}" ${UNAME} ; fi + +RUN mkdir -p /home/${UNAME} \ + && chown ${UNAME}:${UNAME} /home/${UNAME} + +USER ${UNAME} +WORKDIR /home/${UNAME} +ENV BUILD_DIR=${BUILD_DIR} + +RUN pip3 install compdb pre-commit cpplint "clang-format>=14.0.0,<17.0.0" + +RUN echo "PATH=/home/${UNAME}/.local/bin/:\$PATH" >> /home/${UNAME}/.zshenv + +RUN echo "export SRC_DIR=${SRC_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export BUILD_DIR=${BUILD_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "source /opt/intel/oneapi/setvars.sh > /dev/null" >> /home/${UNAME}/.bashrc + +WORKDIR ${SRC_DIR} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..1a5e58b116 --- /dev/null +++ b/Makefile @@ -0,0 +1,131 @@ +SHELL := /bin/bash + +UNAME ?= $(shell whoami) +UID ?= $(shell id -u) +GID ?= $(shell id -g) + +BASE_IMAGE_NAME ?= pando-galois +IMAGE_NAME ?= ${UNAME}-${BASE_IMAGE_NAME} +SRC_DIR ?= $(shell pwd) +VERSION ?= $(shell git log --pretty="%h" -1 Dockerfile) + +CONTAINER_SRC_DIR ?= /pando-galois +CONTAINER_BUILD_DIR ?= /pando-galois/build +CONTAINER_WORKDIR ?= ${CONTAINER_SRC_DIR} +CONTAINER_CONTEXT ?= default +CONTAINER_OPTS ?= +CONTAINER_CPUSET ?= +CONTAINER_CMD ?= bash -l +INTERACTIVE ?= i + +BUILD_TYPE ?= RelWithDebInfo + +# CMake variables +GALOIS_EXTRA_CMAKE_FLAGS ?= "" +GALOIS_EXTRA_CXX_FLAGS ?= "" + +# Developer variables that should be set as env vars in startup files like .profile +GALOIS_CONTAINER_MOUNTS ?= +GALOIS_CONTAINER_ENV ?= +GALOIS_CONTAINER_FLAGS ?= +GALOIS_BUILD_TOOL ?= 'Unix Makefiles' +GALOIS_CCACHE_DIR ?= ${SRC_DIR}/.ccache + +dependencies: dependencies-asdf + 
+dependencies-asdf: + @echo "Updating asdf plugins..." + @asdf plugin update --all >/dev/null 2>&1 || true + @echo "Adding new asdf plugins..." + @cut -d" " -f1 ./.tool-versions | xargs -I % asdf plugin-add % >/dev/null 2>&1 || true + @echo "Installing asdf tools..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf install {}' + @echo "Updating local environment to use proper tool versions..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf local {}' + @asdf reshim + @echo "Done!" + +hooks: + @pre-commit install --hook-type pre-commit + @pre-commit install-hooks + +pre-commit: + @pre-commit run -a + +ci-image: + @${MAKE} docker-image-dependencies + @docker image inspect galois:${VERSION} >/dev/null 2>&1 || \ + docker --context ${CONTAINER_CONTEXT} build \ + --build-arg SRC_DIR=${CONTAINER_SRC_DIR} \ + --build-arg BUILD_DIR=${CONTAINER_BUILD_DIR} \ + --build-arg UNAME=runner \ + --build-arg UID=1078 \ + --build-arg GID=504 \ + -t galois:${VERSION} \ + --file Dockerfile \ + --target dev . + +docker-image: + @${MAKE} docker-image-dependencies + @docker image inspect ${IMAGE_NAME}:${VERSION} >/dev/null 2>&1 || \ + docker --context ${CONTAINER_CONTEXT} build \ + --build-arg SRC_DIR=${CONTAINER_SRC_DIR} \ + --build-arg BUILD_DIR=${CONTAINER_BUILD_DIR} \ + --build-arg UNAME=${UNAME} \ + --build-arg IS_CI=false \ + --build-arg UID=${UID} \ + --build-arg GID=${GID} \ + -t ${IMAGE_NAME}:${VERSION} \ + --file Dockerfile \ + --target dev . + +docker-image-dependencies: + @mkdir -p build + @mkdir -p data + @mkdir -p .ccache + +.PHONY: docker +docker: + @docker --context ${CONTAINER_CONTEXT} run --rm \ + -v ${SRC_DIR}/:${CONTAINER_SRC_DIR} \ + -v ${GALOIS_CCACHE_DIR}/:/home/${UNAME}/.ccache \ + ${GALOIS_CONTAINER_MOUNTS} \ + ${GALOIS_CONTAINER_ENV} \ + ${GALOIS_CONTAINER_FLAGS} \ + ${CONTAINER_CPUSET} \ + --privileged \ + --workdir=${CONTAINER_WORKDIR} \ + ${CONTAINER_OPTS} \ + -${INTERACTIVE}t \ + ${IMAGE_NAME}:${VERSION} \ + ${CONTAINER_CMD} + +run-cmake: + @cmake \ + -S ${SRC_DIR} \ + -B ${BUILD_DIR} \ + -G ${GALOIS_BUILD_TOOL} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DUSE_MKL_BLAS=ON \ + -DGALOIS_ENABLE_DIST=ON \ + ${GALOIS_EXTRA_CMAKE_FLAGS} + +setup: run-cmake + +setup-ci: run-cmake + +run-tests: + @ctest --test-dir build -R wmd --verbose + @ctest --test-dir build -R large-vec --verbose + @ctest --test-dir build -R compile-lscsr --verbose + @ctest --test-dir build -R prefixsum --verbose + @ctest --test-dir build -R wfl --verbose + +# this command is slow since hooks are not stored in the container image +# this is mostly for CI use +docker-pre-commit: + @docker --context ${CONTAINER_CONTEXT} run --rm \ + -v ${SRC_DIR}/:${CONTAINER_SRC_DIR} --privileged \ + --workdir=${CONTAINER_WORKDIR} -t \ + ${IMAGE_NAME}:${VERSION} bash -lc "git config --global --add safe.directory /pando-galois && make hooks && make pre-commit" diff --git a/README.md b/README.md index 3375e800ee..ffda74f765 100644 --- a/README.md +++ b/README.md @@ -11,17 +11,17 @@ an implicitly parallel programming model, where the programmer replaces serial l constructs (e.g. for and while) and serial data structures in their algorithms with parallel loop constructs and concurrent data structures provided by Galois to express their algorithms. Galois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as -threads, locks, barriers, condition variables, etc. +threads, locks, barriers, condition variables, etc. 
Highlights include: - Parallel *for_each* loop that handles dependencies between iterations, as well as dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent scalability on multi-socket systems - A concurrent graph library designed for graph analytics algorithms as well as - other domains such as irregular meshes. -- Scalable concurrent containers such as bag, vector, list, etc. + other domains such as irregular meshes. +- Scalable concurrent containers such as bag, vector, list, etc. -Galois is released under the BSD-3-Clause license. +Galois is released under the BSD-3-Clause license. Building Galois @@ -45,7 +45,7 @@ Dependencies Galois builds, runs, and has been tested on GNU/Linux. Even though Galois may build on systems similar to Linux, we have not tested correctness or performance, so please -beware. +beware. At the minimum, Galois depends on the following software: @@ -55,7 +55,7 @@ At the minimum, Galois depends on the following software: - libllvm (>= 7.0 with RTTI support) - libfmt (>= 4.0) -Here are the dependencies for the optional features: +Here are the dependencies for the optional features: - Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages: @@ -70,13 +70,14 @@ Here are the dependencies for the optional features: - libnuma support. Performance may degrade without it. Please install - libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. -- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files + libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. +- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files - PAPI (>= 5.2.0.0 ) for profiling sections of code - Vtune (>= 2017 ) for profiling sections of code - MPICH2 (>= 3.2) if you are interested in building and running distributed system applications in Galois -- CUDA (>= 8.0) if you want to build GPU or distributed heterogeneous applications +- CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications. + Note that versions >= 11.0 use an incompatible CUB module and will fail to execute. - Eigen (3.3.1 works for us) for some matrix-completion app variants @@ -148,6 +149,12 @@ ctest in the build directory. +Capturing Stack Information +--------------------------- +Currently, adding `-DSTACK_CAPTURE=ON` to your `cmake` line enables stack capturing. +Please view `libgalois/include/galois/runtime/StackTracer.h` for documentation on the functions for printing and resetting. +Do not otherwise attempt to modify the capture process. + Running Galois Applications =========================== @@ -156,9 +163,9 @@ Graph Format ------------ Many Galois/Lonestar applications work with graphs. We store graphs in a binary format -called *galois graph file* +called *galois graph file* (`.gr` file extension). Other formats such as edge-list or Matrix-Market can be -converted to `.gr` format with `graph-convert` tool provided in galois. +converted to `.gr` format with `graph-convert` tool provided in galois.
You can build graph-convert as follows: ```Shell cd $BUILD_DIR make graph-convert ``` Other applications, such as Delaunay Mesh Refinement may read special file formats -or some may even generate random inputs on the fly. +or some may even generate random inputs on the fly. Running ------- All Lonestar applications take a `-t` command-line option to specify the number of threads to use. All applications run a basic sanity check (often insufficient for -correctness) on the program output, which can be turned off with the `-noverify` option. You -can specify `-help` command-line option to print all available options. +correctness) on the program output, which can be turned off with the `-noverify` option. You +can specify `-help` command-line option to print all available options. Upon successful completion, each application will produce some stats regarding running time of various sections, parallel loop iterations and memory usage, etc. These stats are in CSV format and can be redirected to a file using `-statFile` option. -Please refer to the manual for details on stats. +Please refer to the manual for details on stats. Running LonestarGPU applications -------------------------- @@ -199,7 +206,7 @@ Documentation ============= Galois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's -manual and API documentation for the Galois library. +manual and API documentation for the Galois library. Users can build doxygen documentation in the build directory using: @@ -215,12 +222,12 @@ See online documentation at: Source-Tree Organization ======================== -- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. +- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. - `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois - `libdist` contains the source code for the distributed-memory and heterogeneous Galois library - `lonestardist` contains the source code for the distributed-memory and heterogeneous benchmark applications. Please refer to `lonestardist/README.md` for instructions on - building and running these apps. + building and running these apps. - `tools` contains various helper programs such as graph-converter to convert between graph file formats and graph-stats to print graph properties diff --git a/README_SHAD.md b/README_SHAD.md new file mode 100644 index 0000000000..4253bb0e55 --- /dev/null +++ b/README_SHAD.md @@ -0,0 +1,57 @@ +README related to SHAD input graph ingestion +(Including some notes for other workflows) +This README is for our internal purposes. +It will be refined with more concrete information later. + +1. CMakeList paths: +The current CMake in Galois uses hard-coded paths for CUDA_HOME, +OPENBLAS_ROOT, INTEL_COMPILER_LIBRARIES, and MKL_LIBRARIES. +Please set those variables based on your environment. + + +2. Assumptions regarding SHAD WMD graph formats: +We assume that in SHAD WMD graph formats, each node and edge has a single type, +and those types are ALWAYS uint64_t. +The current Galois does not support node/edge properties (possibly, +programmers can implement a struct containing multiple +fields, but that is not like getData(n), etc.) +and so, we store those SHAD types in node and edge data. +If you need types other than uint64_t, you should add new execution paths for +them. + + +3.
Limitations of the current SHAD graph ingestion module: +In the original CuSP, each host reads parts of the .gr graph file and constructs +its in-memory format. In this case, each host does not need to load the full graph +in its memory space. This is possible since the .gr file is CSR and each component +such as outgoing edge indices, outgoing edge destinations, and outgoing edge +data is stored consecutively. + +However, in the SHAD graph format, not all components are stored consecutively; +they are unsorted. For example, edges and nodes can be stored in an interleaved +manner. Therefore, it is not possible to read partial graphs by using +the original method. + +The current SHAD graph ingestion does not aim to be a scalable method; +its goal is simply to make SHAD graphs work in Galois so that workflows can proceed, +so each host reads the FULL graph into memory. This should NOT be the final +artifact, since our long-term target graphs will exceed a single machine's memory. +But for the immediate goal and the target data sets, I assume that it is fine +for now. + +The UT team is currently working on new graph formats for dynamic graphs, and +scalable SHAD graph ingestion across hosts. + +4. TODO: +CuSP marks training/test/validation nodes while it is partitioning a graph. +This is not yet implemented for SHAD graphs. +It will be added in a GNN/feature construction branch. + +5. Requirements: +Galois-GNN requires the additional packages listed below on top of the requirements of Galois. +Older or newer versions may work; the versions listed below are the ones that have been used (by hochan): +1) Intel MKL: 2023.1.0 +2) Intel Compiler (including runtime libraries): 2023.0.0 +3) Intel Onedpl-devel library: 2023.1.0 +4) Intel OpenMP: 2023.0.0 + diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..d87020f770 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,24 @@ +# Find MKL libraries +# Once done this will define +# MKL_FOUND - System has MKL +# MKL_INCLUDE_DIRS - The MKL include directories +# MKL_LIBRARIES - The libraries needed to use MKL + +set(MKL_LIBRARIES) # Include-only library + +if(MKL_INCLUDE_DIRS) + set(MKL_FIND_QUIETLY TRUE) +endif() + +find_path(MKL_INCLUDE_DIRS mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include) +message(STATUS "MKL_INCLUDE_DIRS: ${MKL_INCLUDE_DIRS}") +find_library(MKL_LIBRARY NAMES mkl_rt PATHS ${MKL_ROOT} PATH_SUFFIXES lib/intel64) +message(STATUS "MKL_LIBRARY: ${MKL_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIRS) +if(MKL_FOUND) + set(MKL_FOUND on) +endif() + +mark_as_advanced(MKL_INCLUDE_DIRS) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake new file mode 100644 index 0000000000..3f595744d0 --- /dev/null +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -0,0 +1,24 @@ +# Find OpenBLAS libraries +# Once done this will define +# OpenBLAS_FOUND - System has OpenBLAS +# OpenBLAS_INCLUDE_DIRS - The OpenBLAS include directories +# OpenBLAS_LIBRARIES - The libraries needed to use OpenBLAS + +set(OPENBLAS_LIBRARIES) # Include-only library + +if(OPENBLAS_INCLUDE_DIRS) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif() + +find_path(OPENBLAS_INCLUDE_DIRS cblas.h PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES include/openblas) +message(STATUS "OPENBLAS_INCLUDE_DIRS: ${OPENBLAS_INCLUDE_DIRS}") +find_library(OPENBLAS_LIBRARY NAMES openblas PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES lib64) +message(STATUS "OPENBLAS_LIBRARY: ${OPENBLAS_LIBRARY}")
+include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_LIBRARY OPENBLAS_INCLUDE_DIRS) +if(OPENBLAS_FOUND) + set(OPENBLAS_FOUND on) +endif() + +mark_as_advanced(OPENBLAS_INCLUDE_DIRS) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt new file mode 100644 index 0000000000..310000adc8 --- /dev/null +++ b/external/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(parallel-hashmap) \ No newline at end of file diff --git a/external/cub b/external/cub deleted file mode 160000 index c3cceac115..0000000000 --- a/external/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c3cceac115c072fb63df1836ff46d8c60d9eb304 diff --git a/external/moderngpu b/external/moderngpu deleted file mode 160000 index 2b3985541c..0000000000 --- a/external/moderngpu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2b3985541c8e88a133769598c406c33ddde9d0a5 diff --git a/external/parallel-hashmap b/external/parallel-hashmap new file mode 160000 index 0000000000..67c24619e4 --- /dev/null +++ b/external/parallel-hashmap @@ -0,0 +1 @@ +Subproject commit 67c24619e4f5ab2097b74cc397732c17a25d6944 diff --git a/external/pcg-cpp b/external/pcg-cpp new file mode 160000 index 0000000000..428802d1a5 --- /dev/null +++ b/external/pcg-cpp @@ -0,0 +1 @@ +Subproject commit 428802d1a5634f96bcd0705fab379ff0113bcf13 diff --git a/inputs/wmd/data.00001.csv b/inputs/wmd/data.00001.csv new file mode 100644 index 0000000000..0f18f74182 --- /dev/null +++ b/inputs/wmd/data.00001.csv @@ -0,0 +1,744 @@ +#delimieter: , +#columns:type,person1,person2,forum,forum_event,publication,topic,date,lat,lon +#types:STRING,UINT,UINT,UINT,UINT,UINT,UINT,USDATE,DOUBLE,DOUBLE +Publication,,,,,102583151124020340,,4/1/2013,, +Publication,,,,,1004346153600881042,,12/2/2014,, +Publication,,,,,1433303251800176474,,1/1/2014,, +Publication,,,,,963345652072941810,,3/1/2017,, +ForumEvent,,,1372844135435303981,1651365355351122204,,,1/7/2019,, +ForumEvent,,,1372844135435303981,1060309546214304182,,,1/3/2018,, +ForumEvent,,,1372844135435303981,932362105613871012,,,1/8/2018,, +ForumEvent,,,1372844135435303981,618434247743641149,,,1/8/2018,, +ForumEvent,,,1372844135435303981,1209342585680609487,,,1/10/2018,, +ForumEvent,,,1615340315424362057,1245126351375505703,,,2/13/2018,, +ForumEvent,,,1372844135435303981,581543512052485139,,,2/5/2018,, +ForumEvent,,,1314315120197156050,833681012494554358,,,3/23/2018,, +ForumEvent,,,1615340315424362057,1220295546212024391,,,3/26/2018,, +ForumEvent,,,1372844135435303981,1424263331858043042,,,4/5/2018,, +ForumEvent,,,1615340315424362057,1290121451283392110,,,4/12/2018,, +ForumEvent,,,1427292001647224242,240337224527030225,,,4/24/2018,, +ForumEvent,,,1615340315424362057,440265285168056234,,,5/17/2018,, +ForumEvent,,,1615340315424362057,817526874194673140,,,5/31/2018,, +ForumEvent,,,1314315120197156050,846536331643665114,,,6/12/2018,, +ForumEvent,,,1202482536733844323,1114502034902546550,,,6/14/2018,, +ForumEvent,,,1372844135435303981,1441762191425652442,,,7/8/2018,, +ForumEvent,,,1615340315424362057,128423416112315798,,,7/20/2018,, +ForumEvent,,,1615340315424362057,701755398615636460,,,8/1/2018,, +ForumEvent,,,1314315120197156050,393285992310638641,,,8/12/2018,, +ForumEvent,,,1615340315424362057,420762134340393550,,,9/9/2018,, +ForumEvent,,,1372844135435303981,737353170652104031,,,9/14/2018,, +ForumEvent,,,1615340315424362057,116892402526543412,,,10/13/2018,, +ForumEvent,,,1372844135435303981,1028329324575034354,,,10/20/2018,, +ForumEvent,,,1202482536733844323,1513662032452523252,,,10/30/2018,, 
+ForumEvent,,,1314315120197156050,803952155714850701,,,11/14/2018,, +ForumEvent,,,1372844135435303981,186108460103013588,,,11/12/2018,, +ForumEvent,,,1615340315424362057,1184855350262395542,,,12/1/2018,, +ForumEvent,,,1372844135435303981,1302313601603127196,,,12/16/2018,, +ForumEvent,,,1615340315424362057,78678286442461987,,,1/11/2019,, +ForumEvent,,,15133734353741126,1285128710332882742,,,1/10/2019,, +ForumEvent,,,1615340315424362057,447169043921403064,,,2/2/2019,, +ForumEvent,,,1372844135435303981,91431002216341149,,,2/13/2019,, +ForumEvent,,,1202482536733844323,1296829658689065159,,,2/13/2019,, +ForumEvent,,,1615340315424362057,877764733212222524,,,3/28/2019,, +ForumEvent,,,1314315120197156050,1614534111336540475,,,3/3/2019,, +ForumEvent,,,1615340315424362057,209800678458482108,,,4/14/2019,, +ForumEvent,,,15133734353741126,1532662490035322233,,,4/1/2019,, +ForumEvent,,,1314315120197156050,321724159614056152,,,5/29/2019,, +ForumEvent,,,1372844135435303981,1512214307542520410,,,5/17/2019,, +ForumEvent,,,1615340315424362057,740410432146852843,,,6/5/2019,, +ForumEvent,,,1372844135435303981,82629615412640377,,,6/24/2019,, +ForumEvent,,,1427292001647224242,936722743217343702,,,6/30/2019,, +ForumEvent,,,1372844135435303981,747423119260925972,,,7/11/2019,, +ForumEvent,,,451888058015735870,541215404780905313,,,7/3/2019,, +ForumEvent,,,1615340315424362057,1424660009578332566,,,8/25/2019,, +ForumEvent,,,1314315120197156050,1282227710122181132,,,8/5/2019,, +ForumEvent,,,1314315120197156050,854149383334143372,,,9/19/2019,, +ForumEvent,,,1615340315424362057,202421472143651025,,,9/21/2019,, +ForumEvent,,,353365307219544531,956704137555154092,,,10/17/2019,, +ForumEvent,,,,1142353335442842612,,,10/2/2019,, +ForumEvent,,,,1417645062678302203,,,10/27/2019,, +ForumEvent,,,,691612430615344311,,,11/18/2019,, +ForumEvent,,,,499518911125406276,,,11/7/2019,, +ForumEvent,,,,802203574353867462,,,12/26/2019,, +ForumEvent,,,,1154045191214226005,,,12/19/2019,, +Forum,,,227560344059645632,,,,,, +Forum,,,642724485236726353,,,,,, +Forum,,,1583773067440233990,,,,,, +Forum,,,353365307219544531,,,,,, +Forum,,,1372844135435303981,,,,,, +Forum,,,817570614729612563,,,,,, +Forum,,,1222966301068614432,,,,,, +Forum,,,254347350613723281,,,,,, +Forum,,,230406515001545612,,,,,, +Forum,,,1561731546512891652,,,,,, +Forum,,,188043543797416114,,,,,, +Forum,,,1083041743586306041,,,,,, +Forum,,,132472381132383125,,,,,, +Forum,,,20118285562646166,,,,,, +Forum,,,555784630220125214,,,,,, +Forum,,,1015255971523263924,,,,,, +Forum,,,1342495276080758813,,,,,, +Forum,,,851350143155248158,,,,,, +Forum,,,1427292001647224242,,,,,, +Forum,,,722051276937327353,,,,,, +Forum,,,1107212912316309796,,,,,, +Forum,,,504490409499070811,,,,,, +Forum,,,15133734353741126,,,,,, +Forum,,,869745302967338810,,,,,, +Forum,,,324124332757504717,,,,,, +Forum,,,852491638004013222,,,,,, +Forum,,,1040437236245414809,,,,,, +Forum,,,442231451428861295,,,,,, +Forum,,,101022092642335391,,,,,, +Forum,,,1037815940207624157,,,,,, +Forum,,,1331941318481662527,,,,,, +Forum,,,1615340315424362057,,,,,, +Forum,,,1425519641234605945,,,,,, +Forum,,,705065952261175094,,,,,, +Forum,,,1314315120197156050,,,,,, +Forum,,,214214821270800149,,,,,, +Forum,,,1361197157264541395,,,,,, +Forum,,,1033538541314217453,,,,,, +Forum,,,565733832133342431,,,,,, +Forum,,,451888058015735870,,,,,, +Forum,,,155345234637251110,,,,,, +Forum,,,1371100161965701220,,,,,, +Forum,,,1307221369082243900,,,,,, +Forum,,,406508153569651122,,,,,, +Forum,,,1202482536733844323,,,,,, +Forum,,,912373284682369433,,,,,, 
+Person,477384404927196020,,,,,,,, +Person,182010581109145287,,,,,,,, +Topic,,,,,,271997,,, +Topic,,,,,,127197,,, +Person,284405379592161575,,,,,,,, +Topic,,,,,,11650,,, +Topic,,,,,,185785,,, +Topic,,,,,,1907525,,, +Topic,,,,,,1333024,,, +Topic,,,,,,2329,,, +Topic,,,,,,571,,, +Topic,,,,,,56683126,,, +Topic,,,,,,146,,, +Topic,,,,,,487,,, +Topic,,,,,,193294,,, +Topic,,,,,,177,,, +Topic,,,,,,81944,,, +Topic,,,,,,998,,, +Topic,,,,,,55424107,,, +Topic,,,,,,41323,,, +Topic,,,,,,38695,,, +Topic,,,,,,379860,,, +Topic,,,,,,1149078,,, +Topic,,,,,,172809,,, +Topic,,,,,,1642639,,, +Topic,,,,,,903552,,, +Topic,,,,,,204,,, +Topic,,,,,,7817,,, +Topic,,,,,,201816,,, +Topic,,,,,,785,,49.19,-2.11 +Topic,,,,,,127,,, +Topic,,,,,,206021,,, +Topic,,,,,,181508,,, +Topic,,,,,,735,,, +Topic,,,,,,304878,,, +Topic,,,,,,7590,,, +Topic,,,,,,8074,,, +Topic,,,,,,24862,,, +Topic,,,,,,35127,,, +Topic,,,,,,60,,40.67,-73.94 +Topic,,,,,,443533,,, +Person,1160244137181801222,,,,,,,, +Topic,,,,,,192242,,, +Topic,,,,,,11707,,, +Topic,,,,,,73843,,, +Topic,,,,,,505619,,, +Topic,,,,,,158668,,, +Topic,,,,,,889,,34.0,66.0 +Person,895197896920634500,,,,,,,, +Topic,,,,,,18426,,40.84676,-73.873207 +Topic,,,,,,787185,,, +Topic,,,,,,467,,, +Person,1419850416906085161,,,,,,,, +Topic,,,,,,2869238,,, +Topic,,,,,,5,,, +Topic,,,,,,334600,,, +Topic,,,,,,191290,,, +Topic,,,,,,122113,,, +Topic,,,,,,179057,,, +Topic,,,,,,11635,,, +Topic,,,,,,329717,,, +Person,33927662206515912,,,,,,,, +Topic,,,,,,35140,,, +Topic,,,,,,485537,,, +Topic,,,,,,102014,,, +Topic,,,,,,40357,,, +Topic,,,,,,1337691,,, +Topic,,,,,,160409,,40.7825,-73.966111111111 +Topic,,,,,,69871376,,, +Topic,,,,,,177749,,, +Topic,,,,,,11348,,, +Topic,,,,,,182218,,, +Topic,,,,,,1229,,47.568611111111,40.852783333333 +Topic,,,,,,5322,,, +Person,1035098046740791143,,,,,,,, +Topic,,,,,,792565,,48.10277778,20.78388889 +Topic,,,,,,37654,,, +Topic,,,,,,25395,,40.735277777778,-74.185 +Topic,,,,,,169313,,, +Topic,,,,,,728,,, +Topic,,,,,,699385,,, +Topic,,,,,,22983,,, +Person,971383124880710240,,,,,,,, +Person,1010629269012322480,,,,,,,, +Topic,,,,,,11299,,40.728333333333,-73.994166666667 +Topic,,,,,,83460,,, +Topic,,,,,,10289,,, +Topic,,,,,,11019,,, +Topic,,,,,,470118,,, +Person,1426050562563532645,,,,,,,, +Person,75415528634186650,,,,,,,, +Person,1001287904525368324,,,,,,,, +Person,242111862342742257,,,,,,,, +Topic,,,,,,11249,,40.747,-73.986 +Topic,,,,,,3933135,,, +Topic,,,,,,44311,,, +Person,1025135622623992536,,,,,,,, +Topic,,,,,,37497186,,, +Person,584485814982143221,,,,,,,, +Person,1508332501512270227,,,,,,,, +Topic,,,,,,328473,,40.712,-74.002 +Topic,,,,,,25347,,, +Topic,,,,,,175111,,, +Person,1312322776399358210,,,,,,,, +Topic,,,,,,16003594,,, +Topic,,,,,,48789658,,, +Topic,,,,,,8148,,, +Topic,,,,,,9420,,, +Topic,,,,,,771572,,40.699945,-73.950148 +Topic,,,,,,5088838,,, +Person,1597454052092354280,,,,,,,, +Person,961135479935321085,,,,,,,, +Topic,,,,,,123705,,, +Topic,,,,,,598435,,, +Topic,,,,,,732934,,, +Person,317248309514344163,,,,,,,, +Person,1524681741257900519,,,,,,,, +Topic,,,,,,254860,,, +Topic,,,,,,335046,,, +Person,534449219561977424,,,,,,,, +Person,1035056342462002945,,,,,,,, +Person,1222330726897222256,,,,,,,, +Person,493345739124130581,,,,,,,, +Topic,,,,,,831691,,, +Topic,,,,,,28321638,,, +Person,682588400093615551,,,,,,,, +Person,920136262355651383,,,,,,,, +Person,351354309273100074,,,,,,,, +Person,495352903902152146,,,,,,,, +Person,211778681592778731,,,,,,,, +Topic,,,,,,1189753,,, +Person,396953035572582107,,,,,,,, +Topic,,,,,,828749,,, +Topic,,,,,,904756,,, 
+Topic,,,,,,7392008,,, +Topic,,,,,,2566598,,, +Person,363047312690634767,,,,,,,, +Topic,,,,,,618102,,, +Person,205415260510814362,,,,,,,, +Person,1251650482793161774,,,,,,,, +Topic,,,,,,620463,,, +Person,1463522545161373807,,,,,,,, +Person,1150357430325141247,,,,,,,, +Person,674253449444876344,,,,,,,, +Person,1073324208204442390,,,,,,,, +Topic,,,,,,1049632,,40.665352,-73.969264 +Topic,,,,,,29171,,, +Person,1637740339335566412,,,,,,,, +Person,524508243055647325,,,,,,,, +Person,320151361710953715,,,,,,,, +Person,128643504412157535,,,,,,,, +Topic,,,,,,361,,, +Person,1243472362254658420,,,,,,,, +Topic,,,,,,617927,,, +Person,1275555184736572954,,,,,,,, +Topic,,,,,,974850,,, +Person,934144115142885657,,,,,,,, +Topic,,,,,,217627,,, +Topic,,,,,,223155,,, +Person,1504217244688272832,,,,,,,, +Person,144548678565311334,,,,,,,, +Person,1400516284533535554,,,,,,,, +Topic,,,,,,14528,,, +Person,1508951542204233332,,,,,,,, +Person,611325512448133762,,,,,,,, +Person,635555368637193420,,,,,,,, +Person,134403203055015143,,,,,,,, +Topic,,,,,,202013,,, +Topic,,,,,,7602643,,, +Topic,,,,,,121765,,40.774444444444,-73.904166666667 +Person,765254641650259739,,,,,,,, +Person,975526659664533195,,,,,,,, +Person,273872236541568195,,,,,,,, +Topic,,,,,,557887,,, +Topic,,,,,,774228,,, +Topic,,,,,,46744,,, +Person,352033450190732475,,,,,,,, +Person,841466124620556016,,,,,,,, +Person,1517466541524095404,,,,,,,, +Topic,,,,,,114633,,, +Person,301710390995444087,,,,,,,, +Topic,,,,,,16868955,,, +Person,747231730275042400,,,,,,,, +Person,1164902255571715230,,,,,,,, +Person,291914370254601234,,,,,,,, +Topic,,,,,,247154,,, +Topic,,,,,,519,,, +Topic,,,,,,3303945,,, +Person,1022241560051472272,,,,,,,, +Person,566448585007839403,,,,,,,, +Topic,,,,,,11229,,, +Person,735713441679521195,,,,,,,, +Person,1128501731262832684,,,,,,,, +Person,446962590481145702,,,,,,,, +Person,1125113326787431160,,,,,,,, +Person,437201545096608055,,,,,,,, +Person,940377106445268064,,,,,,,, +Person,1647329525841402942,,,,,,,, +Topic,,,,,,3884230,,, +Topic,,,,,,131191,,, +Person,1376053313411407054,,,,,,,, +Person,1347432655942023365,,,,,,,, +Person,1472154222902711100,,,,,,,, +Person,529550602103217450,,,,,,,, +Topic,,,,,,43035,,, +Topic,,,,,,126095,,, +Topic,,,,,,49088,,40.8075,-73.961944444444 +Person,910075513854877065,,,,,,,, +Topic,,,,,,5018694,,40.859105555556,-74.198686111111 +Topic,,,,,,2446683,,43.1189,20.0797 +Topic,,,,,,2030894,,40.850852,-73.844949 +Topic,,,,,,29718382,,, +Topic,,,,,,130965,,40.860833333333,-73.884444444444 +Topic,,,,,,167172,,, +Topic,,,,,,2456507,,, +Person,842652402732741813,,,,,,,, +Topic,,,,,,670897,,33.421111111111,-111.93166666667 +Topic,,,,,,1436668,,, +Topic,,,,,,753651,,, +Topic,,,,,,7451247,,, +Topic,,,,,,2493,,, +Person,719533111062900642,,,,,,,, +Person,834321901190546647,,,,,,,, +Topic,,,,,,12796,,, +Person,937074421253040138,,,,,,,, +Topic,,,,,,18159587,,, +Person,101810442957214781,,,,,,,, +Topic,,,,,,131401,,, +Topic,,,,,,929920,,, +Topic,,,,,,466439,,, +Topic,,,,,,6498684,,, +Topic,,,,,,206361,,, +Topic,,,,,,41796,,, +Person,1152266442105786574,,,,,,,, +Person,95240187156237415,,,,,,,, +Topic,,,,,,7897553,,, +Topic,,,,,,206887,,, +Topic,,,,,,5405633,,, +Person,1031526243841315760,,,,,,,, +Topic,,,,,,11348540,,, +Topic,,,,,,4198163,,, +Topic,,,,,,16048728,,, +Topic,,,,,,189756,,, +Topic,,,,,,643638,,, +Topic,,,,,,783874,,, +Topic,,,,,,492346,,37.2708,-76.7069 +Topic,,,,,,29042975,,, +Topic,,,,,,12103677,,, +Topic,,,,,,2329157,,, +Person,1563598527979706128,,,,,,,, +Topic,,,,,,4442,,, 
+Person,264075025125849069,,,,,,,, +Person,369370063627142227,,,,,,,, +Person,1300183120520109060,,,,,,,, +Topic,,,,,,18031504,,, +Topic,,,,,,4229887,,, +Person,611117914195523184,,,,,,,, +Topic,,,,,,7543639,,, +Topic,,,,,,13977,,, +Topic,,,,,,18122778,,, +Person,166319955306346577,,,,,,,, +Topic,,,,,,588894,,, +Topic,,,,,,2454265,,, +Person,1547400408884914628,,,,,,,, +Person,373641740834326257,,,,,,,, +Topic,,,,,,5264957,,, +Topic,,,,,,968598,,, +Person,754480939973310112,,,,,,,, +Topic,,,,,,1808877,,, +Person,1443919105364146460,,,,,,,, +Person,735243266472522113,,,,,,,, +Person,1321304826561136177,,,,,,,, +Person,1560601202484151215,,,,,,,, +Person,1403521534163206962,,,,,,,, +Person,231472126788137195,,,,,,,, +Person,208411288512434105,,,,,,,, +Topic,,,,,,7252790,,, +Person,1211456636406749825,,,,,,,, +Person,1071303249530347453,,,,,,,, +Person,1069710216181783510,,,,,,,, +Person,1578613817419480731,,,,,,,, +Person,944546653739552042,,,,,,,, +Topic,,,,,,8856932,,, +Person,616673625330310949,,,,,,,, +Person,1302421465423646583,,,,,,,, +Person,720320812100121121,,,,,,,, +Person,653345304799504620,,,,,,,, +Person,346401281431409585,,,,,,,, +Person,1526112405471861415,,,,,,,, +Person,1501623481588541372,,,,,,,, +Person,312380611598980641,,,,,,,, +Person,1115244423173415593,,,,,,,, +Person,1555348115336584230,,,,,,,, +Person,12321118467056216,,,,,,,, +Person,1352636429150180228,,,,,,,, +Person,725324491051434870,,,,,,,, +Person,846764541256336994,,,,,,,, +Person,140443713446471314,,,,,,,, +Person,1135272113235621141,,,,,,,, +Person,775818654043059161,,,,,,,, +Person,529476525413023401,,,,,,,, +Person,1262668194076216011,,,,,,,, +Person,119474435514352445,,,,,,,, +Person,437573095319558705,,,,,,,, +Person,1035555223142154728,,,,,,,, +Person,556320934631523806,,,,,,,, +Person,1356253242219285320,,,,,,,, +Person,248654236829951090,,,,,,,, +Person,481153633235353485,,,,,,,, +Includes,,,1202482536733844323,1296829658689065159,,,,, +HasTopic,,,,956704137555154092,,335046,,, +HasTopic,,,,1028329324575034354,,1808877,,, +HasTopic,,,,,1004346153600881042,735,,, +Author,1560601202484151215,,,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,131401,,, +HasTopic,,,,1302313601603127196,,48789658,,, +HasTopic,,,,1114502034902546550,,40357,,, +Sale,1275555184736572954,1463522545161373807,,,,,8/16/2018,, +HasTopic,,,,78678286442461987,,28321638,,, +HasTopic,,,,854149383334143372,,903552,,, +HasTopic,,,,240337224527030225,,519,,, +HasTopic,,,,116892402526543412,,5264957,,, +HasTopic,,,,202421472143651025,,12103677,,, +HasTopic,,,,393285992310638641,,470118,,, +Author,910075513854877065,,,,102583151124020340,,,, +Sale,1426050562563532645,75415528634186650,,,,,7/29/2018,, +Author,1128501731262832684,,,1513662032452523252,,,,, +Sale,971383124880710240,1010629269012322480,,,,38695,7/28/2018,, +Author,477384404927196020,,,1651365355351122204,,,,, +Author,725324491051434870,,,202421472143651025,,,,, +HasTopic,,,,932362105613871012,,69871376,,, +HasTopic,,,,1290121451283392110,,169313,,, +Sale,396953035572582107,1400516284533535554,,,,41323,8/21/2018,, +HasTopic,,,,701755398615636460,,8148,,, +Sale,940377106445268064,1647329525841402942,,,,25347,10/15/2018,, +HasTopic,,,,1651365355351122204,,643638,,, +HasTopic,,,,1114502034902546550,,46744,,, +HasTopic,,,,936722743217343702,,123705,,, +HasTopic,,,,321724159614056152,,6498684,,, +Sale,1419850416906085161,1128501731262832684,,,,2869238,09/28/2018,, +HasTopic,,,1372844135435303981,,,60,,, +Author,1578613817419480731,,,321724159614056152,,,,, 
+HasTopic,,,,1512214307542520410,,8074,,, +HasTopic,,,,618434247743641149,,192242,,, +Includes,,,1202482536733844323,1114502034902546550,,,,, +Includes,,,1615340315424362057,877764733212222524,,,,, +HasTopic,,,,240337224527030225,,1229,,, +HasTopic,,,,1209342585680609487,,179057,,, +Author,834321901190546647,,,,1433303251800176474,,,, +HasTopic,,,,1209342585680609487,,175111,,, +Includes,,,1372844135435303981,581543512052485139,,,,, +Author,720320812100121121,,,420762134340393550,,,,, +Includes,,,1615340315424362057,1424660009578332566,,,,, +HasTopic,,,,932362105613871012,,771572,,, +Includes,,,353365307219544531,956704137555154092,,,,, +HasTopic,,,,956704137555154092,,929920,,, +HasTopic,,,,1441762191425652442,,177749,,, +Author,1563598527979706128,,,1220295546212024391,,,,, +Author,248654236829951090,,,1424660009578332566,,,,, +Author,529476525413023401,,,1302313601603127196,,,,, +HasTopic,,,,1209342585680609487,,771572,,, +HasTopic,,,,,1433303251800176474,83460,,, +Sale,1022241560051472272,1637740339335566412,,,,1642639,10/13/2018,, +Includes,,,1615340315424362057,128423416112315798,,,,, +Includes,,,1372844135435303981,932362105613871012,,,,, +Includes,,,1372844135435303981,747423119260925972,,,,, +HasTopic,,,,854149383334143372,,24862,,, +Sale,1160244137181801222,1035056342462002945,,,,181508,10/4/2018,, +HasTopic,,,,1114502034902546550,,328473,,, +Includes,,,1314315120197156050,854149383334143372,,,,, +Sale,1472154222902711100,1128501731262832684,,,,185785,09/28/2018,, +HasOrg,,,,,102583151124020340,49088,,, +HasTopic,,,,932362105613871012,,11299,,, +HasTopic,,,,932362105613871012,,18426,,, +HasTopic,,,,,1004346153600881042,7392008,,, +Author,1526112405471861415,,,846536331643665114,,,,, +HasTopic,,,,78678286442461987,,617927,,, +HasTopic,,,,321724159614056152,,4229887,,, +HasTopic,,,,82629615412640377,,5322,,, +Author,1128501731262832684,,,1114502034902546550,,,,, +Author,166319955306346577,,,209800678458482108,,,,, +Sale,735713441679521195,1128501731262832684,,,,11650,10/10/2018,, +Author,944546653739552042,,,803952155714850701,,,,, +HasTopic,,,,581543512052485139,,3933135,,, +HasTopic,,,,209800678458482108,,4198163,,, +HasTopic,,,,932362105613871012,,7590,,, +Includes,,,1372844135435303981,1060309546214304182,,,,, +HasTopic,,,,846536331643665114,,167172,,, +HasTopic,,,,740410432146852843,,11348540,,, +Author,937074421253040138,,,,1004346153600881042,,,, +HasTopic,,,,1282227710122181132,,2493,,, +HasTopic,,,,,1433303251800176474,43035,,, +Sale,495352903902152146,211778681592778731,,,,81944,8/6/2018,, +Includes,,,1372844135435303981,1512214307542520410,,,,, +HasTopic,,,353365307219544531,,,2329,,, +Includes,,,1615340315424362057,701755398615636460,,,,, +Includes,,,1372844135435303981,1209342585680609487,,,,, +Includes,,,1202482536733844323,1513662032452523252,,,,, +HasTopic,,,,701755398615636460,,968598,,, +HasTopic,,,,128423416112315798,,11019,,, +Author,1356253242219285320,,,854149383334143372,,,,, +Author,1031526243841315760,,,,1433303251800176474,,,, +Author,842652402732741813,,,,1433303251800176474,,,, +HasTopic,,,,440265285168056234,,18159587,,, +Author,1501623481588541372,,,833681012494554358,,,,, +HasTopic,,,,1028329324575034354,,11299,,, +Author,140443713446471314,,,1028329324575034354,,,,, +Author,264075025125849069,,,1245126351375505703,,,,, +Includes,,,1314315120197156050,1614534111336540475,,,,, +HasTopic,,,,,1433303251800176474,131191,,, +HasTopic,,,,420762134340393550,,1907525,,, +Sale,975526659664533195,524508243055647325,,,,,10/15/2018,, 
+Author,284405379592161575,,,,102583151124020340,,,, +HasTopic,,,,,963345652072941810,735,,, +HasTopic,,,,,1433303251800176474,3303945,,, +Author,1152266442105786574,,,,963345652072941810,,,, +HasTopic,,,,202421472143651025,,1189753,,, +HasTopic,,,,1028329324575034354,,11229,,, +Includes,,,1615340315424362057,1220295546212024391,,,,, +Includes,,,1314315120197156050,846536331643665114,,,,, +HasTopic,,,,1296829658689065159,,7451247,,, +Includes,,,1615340315424362057,78678286442461987,,,,, +HasTopic,,,,541215404780905313,,11635,,, +Author,481153633235353485,,,956704137555154092,,,,, +Author,611117914195523184,,,701755398615636460,,,,, +Author,95240187156237415,,,,963345652072941810,,,, +Includes,,,1372844135435303981,91431002216341149,,,,, +Author,846764541256336994,,,1290121451283392110,,,,, +HasTopic,,,,393285992310638641,,206887,,, +HasTopic,,,,1512214307542520410,,177749,,, +HasTopic,,,,1114502034902546550,,7252790,,, +HasTopic,,,1202482536733844323,,,60,,, +HasTopic,,,,1302313601603127196,,41796,,, +HasTopic,,,,1114502034902546550,,44311,,, +Author,1547400408884914628,,,393285992310638641,,,,, +Author,312380611598980641,,,78678286442461987,,,,, +Sale,273872236541568195,1251650482793161774,,,,172809,8/22/2018,, +HasTopic,,,,1513662032452523252,,728,,, +Includes,,,1615340315424362057,447169043921403064,,,,, +Includes,,,1372844135435303981,82629615412640377,,,,, +HasTopic,,,,1302313601603127196,,29042975,,, +Author,1321304826561136177,,,1532662490035322233,,,,, +Includes,,,1372844135435303981,1302313601603127196,,,,, +Sale,363047312690634767,242111862342742257,,,,,10/4/2018,, +Author,1071303249530347453,,,737353170652104031,,,,, +HasTopic,,,,1282227710122181132,,35140,,, +HasTopic,,,,91431002216341149,,46744,,, +Includes,,,1372844135435303981,1441762191425652442,,,,, +Sale,446962590481145702,534449219561977424,,,,,10/11/2018,, +Author,1035555223142154728,,,877764733212222524,,,,, +Author,1403521534163206962,,,932362105613871012,,,,, +HasTopic,,,,701755398615636460,,35127,,, +Includes,,,1372844135435303981,1028329324575034354,,,,, +HasTopic,,,,1245126351375505703,,254860,,, +HasTopic,,,,209800678458482108,,7897553,,, +Includes,,,1372844135435303981,186108460103013588,,,,, +HasOrg,,,,,102583151124020340,304878,,, +HasTopic,,,,,1433303251800176474,998,,, +Author,653345304799504620,,,581543512052485139,,,,, +Author,1302421465423646583,,,240337224527030225,,,,, +Author,1211456636406749825,,,618434247743641149,,,,, +HasTopic,,,,240337224527030225,,785,,, +HasTopic,,,,,1004346153600881042,83460,,, +Includes,,,1615340315424362057,1245126351375505703,,,,, +HasTopic,,,,747423119260925972,,16868955,,, +HasTopic,,,,91431002216341149,,9420,,, +Includes,,,451888058015735870,541215404780905313,,,,, +HasTopic,,,,1209342585680609487,,492346,,, +HasTopic,,,,1290121451283392110,,114633,,, +HasTopic,,,,1441762191425652442,,11249,,, +HasTopic,,,,1532662490035322233,,753651,,, +HasTopic,,,,1532662490035322233,,11707,,, +HasTopic,,,,1245126351375505703,,904756,,, +HasTopic,,,15133734353741126,,,189756,,, +Sale,1222330726897222256,493345739124130581,,,,177,8/4/2018,, +Includes,,,1372844135435303981,1424263331858043042,,,,, +Includes,,,1615340315424362057,1290121451283392110,,,,, +Sale,477384404927196020,182010581109145287,,,,271997,2/17/2019,, +HasTopic,,,,737353170652104031,,16003594,,, +HasTopic,,,,747423119260925972,,2329157,,, +Sale,566448585007839403,765254641650259739,,,,,10/9/2018,, +HasOrg,,,,,963345652072941810,130965,,, +Author,616673625330310949,,,936722743217343702,,,,, 
+HasTopic,,,,1513662032452523252,,44311,,, +Author,119474435514352445,,,1282227710122181132,,,,, +HasTopic,,,,82629615412640377,,22983,,, +HasTopic,,,,833681012494554358,,201816,,, +HasTopic,,,,,1433303251800176474,14528,,, +Sale,33927662206515912,934144115142885657,,,,,10/10/2018,, +Includes,,,1372844135435303981,618434247743641149,,,,, +Includes,,,1615340315424362057,740410432146852843,,,,, +Author,1300183120520109060,,,128423416112315798,,,,, +Author,1069710216181783510,,,1296829658689065159,,,,, +Sale,1504217244688272832,144548678565311334,,,,56683126,8/13/2018,, +Author,1115244423173415593,,,1614534111336540475,,,,, +Sale,747231730275042400,584485814982143221,,,,127,10/7/2018,, +HasTopic,,,,1285128710332882742,,37654,,, +HasTopic,,,1615340315424362057,,,12796,,, +Author,1555348115336584230,,,817526874194673140,,,,, +HasTopic,,,,,102583151124020340,43035,,, +HasTopic,,,,737353170652104031,,206361,,, +Includes,,,1314315120197156050,833681012494554358,,,,, +Sale,128643504412157535,320151361710953715,,,,443533,8/24/2018,, +HasTopic,,,,1220295546212024391,,588894,,, +HasTopic,,,,202421472143651025,,7602643,,, +Sale,1150357430325141247,674253449444876344,,,,,10/7/2018,, +Author,735243266472522113,,,1209342585680609487,,,,, +Includes,,,1372844135435303981,1651365355351122204,,,,, +HasTopic,,,,1424660009578332566,,618102,,, +HasTopic,,,,,102583151124020340,126095,,, +HasTopic,,,,,1004346153600881042,2446683,,, +HasTopic,,,,,1433303251800176474,598435,,, +Author,735713441679521195,,,,1433303251800176474,,,, +Includes,,,1314315120197156050,321724159614056152,,,,, +HasTopic,,,,1114502034902546550,,8856932,,, +HasTopic,,,,,1433303251800176474,5405633,,, +HasTopic,,,,1285128710332882742,,11299,,, +HasTopic,,,,420762134340393550,,12796,,, +HasTopic,,,,1184855350262395542,,329717,,, +HasTopic,,,,116892402526543412,,2456507,,, +Includes,,,1615340315424362057,817526874194673140,,,,, +Sale,841466124620556016,1517466541524095404,,,,,8/27/2018,, +HasTopic,,,,1209342585680609487,,127197,,, +HasTopic,,,,420762134340393550,,16048728,,, +HasTopic,,,,209800678458482108,,774228,,, +Author,231472126788137195,,,82629615412640377,,,,, +HasTopic,,,,,1433303251800176474,7817,,, +HasTopic,,,,932362105613871012,,1049632,,, +Sale,1164902255571715230,291914370254601234,,,,571,10/8/2018,, +HasTopic,,,,,1433303251800176474,467,,, +HasTopic,,,,1220295546212024391,,37497186,,, +HasTopic,,,,817526874194673140,,732934,,, +Includes,,,1615340315424362057,202421472143651025,,,,, +HasTopic,,,,1513662032452523252,,889,,, +Includes,,,1372844135435303981,737353170652104031,,,,, +HasTopic,,,,1296829658689065159,,83460,,, +HasTopic,,,,747423119260925972,,1333024,,, +HasTopic,,,,1290121451283392110,,204,,, +Author,611117914195523184,,,440265285168056234,,,,, +Author,754480939973310112,,,1441762191425652442,,,,, +Includes,,,1427292001647224242,936722743217343702,,,,, +HasTopic,,,,1441762191425652442,,485537,,, +Sale,1376053313411407054,1347432655942023365,,,,55424107,2/3/2019,, +HasTopic,,,,581543512052485139,,519,,, +Sale,682588400093615551,920136262355651383,,,,,10/5/2018,, +Author,346401281431409585,,,1114502034902546550,,,,, +Sale,317248309514344163,1524681741257900519,,,,,8/3/2018,, +HasTopic,,,,618434247743641149,,3884230,,, +Sale,1243472362254658420,205415260510814362,,,,,8/9/2018,, +Sale,1472154222902711100,529550602103217450,,,,185785,2/17/2019,, +Includes,,,1314315120197156050,803952155714850701,,,,, +Author,1262668194076216011,,,747423119260925972,,,,, +HasTopic,,,,833681012494554358,,787185,,, 
+HasTopic,,,,932362105613871012,,160409,,, +Sale,1125113326787431160,437201545096608055,,,,146,10/14/2018,, +HasTopic,,,,1424660009578332566,,334600,,, +HasTopic,,,451888058015735870,,,121765,,, +HasTopic,,,,1285128710332882742,,193294,,, +HasTopic,,,,956704137555154092,,217627,,, +HasTopic,,,,1424263331858043042,,2454265,,, +HasTopic,,,,1651365355351122204,,783874,,, +HasTopic,,,,1209342585680609487,,792565,,, +Author,1128501731262832684,,,1060309546214304182,,,,, +Author,775818654043059161,,,740410432146852843,,,,, +HasTopic,,,,854149383334143372,,4442,,, +HasOrg,,,,,1004346153600881042,670897,,, +Sale,1508951542204233332,611325512448133762,,,,,10/11/2018,, +HasTopic,,,,747423119260925972,,202013,,, +Author,208411288512434105,,,1513662032452523252,,,,, +HasTopic,,,,740410432146852843,,5088838,,, +Includes,,,15133734353741126,1532662490035322233,,,,, +HasTopic,,,,817526874194673140,,18122778,,, +HasTopic,,,,440265285168056234,,102014,,, +HasTopic,,,,186108460103013588,,732934,,, +Includes,,,1615340315424362057,209800678458482108,,,,, +Author,373641740834326257,,,116892402526543412,,,,, +HasTopic,,,,740410432146852843,,122113,,, +Includes,,,1615340315424362057,440265285168056234,,,,, +Author,719533111062900642,,,,1433303251800176474,,,, +HasTopic,,,,420762134340393550,,10289,,, +HasTopic,,,,1184855350262395542,,11348,,, +HasTopic,,,,82629615412640377,,247154,,, +Author,369370063627142227,,,1184855350262395542,,,,, +Author,1135272113235621141,,,91431002216341149,,,,, +Sale,635555368637193420,134403203055015143,,,,,8/15/2018,, +Includes,,,1314315120197156050,1282227710122181132,,,,, +HasTopic,,,,,1433303251800176474,828749,,, +HasTopic,,,1314315120197156050,,,12796,,, +HasTopic,,,,541215404780905313,,182218,,, +HasTopic,,,,877764733212222524,,7543639,,, +HasTopic,,,,,1433303251800176474,1436668,,, +HasTopic,,,,447169043921403064,,29171,,, +HasOrg,,,,,1433303251800176474,2030894,,, +Sale,477384404927196020,1128501731262832684,,,,271997,09/30/2018,, +HasTopic,,,,581543512052485139,,505619,,, +HasTopic,,,,618434247743641149,,60,,, +HasTopic,,,,803952155714850701,,557887,,, +HasTopic,,,,78678286442461987,,831691,,, +Author,101810442957214781,,,,1004346153600881042,,,, +Sale,1597454052092354280,961135479935321085,,,,1149078,8/1/2018,, +HasOrg,,,,,1004346153600881042,5018694,,, +Sale,301710390995444087,1312322776399358210,,,,379860,10/12/2018,, +HasTopic,,,,,1004346153600881042,29718382,,, +HasTopic,,,,,1433303251800176474,620463,,, +HasTopic,,,,1651365355351122204,,44311,,, +Author,1443919105364146460,,,1424263331858043042,,,,, +Sale,895197896920634500,1508332501512270227,,,,487,7/31/2018,, +HasTopic,,,,321724159614056152,,158668,,, +Includes,,,1427292001647224242,240337224527030225,,,,, +HasTopic,,,,1184855350262395542,,223155,,, +Includes,,,1615340315424362057,1184855350262395542,,,,, +HasTopic,,,,1532662490035322233,,1337691,,, +Author,1352636429150180228,,,186108460103013588,,,,, +HasTopic,,,,393285992310638641,,974850,,, +Includes,,,1615340315424362057,116892402526543412,,,,, +HasTopic,,,,128423416112315798,,361,,, +HasTopic,,,,541215404780905313,,13977,,, +HasTopic,,,,1512214307542520410,,699385,,, +Sale,1035098046740791143,352033450190732475,,,,,8/25/2018,, +Includes,,,1615340315424362057,420762134340393550,,,,, +Author,556320934631523806,,,541215404780905313,,,,, +HasTopic,,,1427292001647224242,,,25395,,, +HasTopic,,,,1424660009578332566,,191290,,, +Sale,351354309273100074,1073324208204442390,,,,206021,8/7/2018,, +Sale,1025135622623992536,1001287904525368324,,,,,10/2/2018,, 
+Author,12321118467056216,,,1512214307542520410,,,,, +HasTopic,,,,932362105613871012,,60,,, +HasTopic,,,,803952155714850701,,5,,, +HasTopic,,,,1220295546212024391,,18031504,,, +HasTopic,,,,91431002216341149,,73843,,, +Author,437573095319558705,,,447169043921403064,,,,, +HasTopic,,,,1424263331858043042,,2566598,,, +Includes,,,15133734353741126,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,466439,,, +Includes,,,1314315120197156050,393285992310638641,,,,, diff --git a/inputs/wmd/data.001.csv b/inputs/wmd/data.001.csv new file mode 100644 index 0000000000..f5479d326c --- /dev/null +++ b/inputs/wmd/data.001.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff7df1aa0a2261d930471fc057251d1aa2cb404fa8c88c12c3b65fe2a5204bf8 +size 2879652 diff --git a/inputs/wmd/dynamic0.txt b/inputs/wmd/dynamic0.txt new file mode 100644 index 0000000000..37648532bc --- /dev/null +++ b/inputs/wmd/dynamic0.txt @@ -0,0 +1,46 @@ +Author,346401281431409585,,,1114502034902546550,,,,, +Sale,317248309514344163,1524681741257900519,,,,,8/3/2018,, +HasTopic,,,,618434247743641149,,3884230,,, +Sale,1243472362254658420,205415260510814362,,,,,8/9/2018,, +Sale,1472154222902711100,529550602103217450,,,,185785,2/17/2019,, +Includes,,,1314315120197156050,803952155714850701,,,,, +Author,1262668194076216011,,,747423119260925972,,,,, +HasTopic,,,,833681012494554358,,787185,,, +HasTopic,,,,932362105613871012,,160409,,, +Sale,1125113326787431160,437201545096608055,,,,146,10/14/2018,, +HasTopic,,,,1424660009578332566,,334600,,, +HasTopic,,,451888058015735870,,,121765,,, +HasTopic,,,,1285128710332882742,,193294,,, +HasTopic,,,,956704137555154092,,217627,,, +HasTopic,,,,1424263331858043042,,2454265,,, +HasTopic,,,,1651365355351122204,,783874,,, +HasTopic,,,,1209342585680609487,,792565,,, +Author,1128501731262832684,,,1060309546214304182,,,,, +Author,775818654043059161,,,740410432146852843,,,,, +HasTopic,,,,854149383334143372,,4442,,, +HasOrg,,,,,1004346153600881042,670897,,, +Sale,1508951542204233332,611325512448133762,,,,,10/11/2018,, +HasTopic,,,,747423119260925972,,202013,,, +Author,208411288512434105,,,1513662032452523252,,,,, +HasTopic,,,,740410432146852843,,5088838,,, +Includes,,,15133734353741126,1532662490035322233,,,,, +HasTopic,,,,817526874194673140,,18122778,,, +HasTopic,,,,440265285168056234,,102014,,, +HasTopic,,,,186108460103013588,,732934,,, +Includes,,,1615340315424362057,209800678458482108,,,,, +Author,373641740834326257,,,116892402526543412,,,,, +HasTopic,,,,740410432146852843,,122113,,, +Includes,,,1615340315424362057,440265285168056234,,,,, +Author,719533111062900642,,,,1433303251800176474,,,, +HasTopic,,,,420762134340393550,,10289,,, +HasTopic,,,,1184855350262395542,,11348,,, +HasTopic,,,,82629615412640377,,247154,,, +Author,369370063627142227,,,1184855350262395542,,,,, +Author,1135272113235621141,,,91431002216341149,,,,, +Sale,635555368637193420,134403203055015143,,,,,8/15/2018,, +Includes,,,1314315120197156050,1282227710122181132,,,,, +HasTopic,,,,,1433303251800176474,828749,,, +HasTopic,,,1314315120197156050,,,12796,,, +HasTopic,,,,541215404780905313,,182218,,, +HasTopic,,,,877764733212222524,,7543639,,, +HasTopic,,,,,1433303251800176474,1436668,,, diff --git a/inputs/wmd/dynamic1.txt b/inputs/wmd/dynamic1.txt new file mode 100644 index 0000000000..8f13aa07e8 --- /dev/null +++ b/inputs/wmd/dynamic1.txt @@ -0,0 +1,44 @@ +HasTopic,,,,447169043921403064,,29171,,, +HasOrg,,,,,1433303251800176474,2030894,,, +Sale,477384404927196020,1128501731262832684,,,,271997,09/30/2018,, 
+HasTopic,,,,581543512052485139,,505619,,, +HasTopic,,,,618434247743641149,,60,,, +HasTopic,,,,803952155714850701,,557887,,, +HasTopic,,,,78678286442461987,,831691,,, +Author,101810442957214781,,,,1004346153600881042,,,, +Sale,1597454052092354280,961135479935321085,,,,1149078,8/1/2018,, +HasOrg,,,,,1004346153600881042,5018694,,, +Sale,301710390995444087,1312322776399358210,,,,379860,10/12/2018,, +HasTopic,,,,,1004346153600881042,29718382,,, +HasTopic,,,,,1433303251800176474,620463,,, +HasTopic,,,,1651365355351122204,,44311,,, +Author,1443919105364146460,,,1424263331858043042,,,,, +Sale,895197896920634500,1508332501512270227,,,,487,7/31/2018,, +HasTopic,,,,321724159614056152,,158668,,, +Includes,,,1427292001647224242,240337224527030225,,,,, +HasTopic,,,,1184855350262395542,,223155,,, +Includes,,,1615340315424362057,1184855350262395542,,,,, +HasTopic,,,,1532662490035322233,,1337691,,, +Author,1352636429150180228,,,186108460103013588,,,,, +HasTopic,,,,393285992310638641,,974850,,, +Includes,,,1615340315424362057,116892402526543412,,,,, +HasTopic,,,,128423416112315798,,361,,, +HasTopic,,,,541215404780905313,,13977,,, +HasTopic,,,,1512214307542520410,,699385,,, +Sale,1035098046740791143,352033450190732475,,,,,8/25/2018,, +Includes,,,1615340315424362057,420762134340393550,,,,, +Author,556320934631523806,,,541215404780905313,,,,, +HasTopic,,,1427292001647224242,,,25395,,, +HasTopic,,,,1424660009578332566,,191290,,, +Sale,351354309273100074,1073324208204442390,,,,206021,8/7/2018,, +Sale,1025135622623992536,1001287904525368324,,,,,10/2/2018,, +Author,12321118467056216,,,1512214307542520410,,,,, +HasTopic,,,,932362105613871012,,60,,, +HasTopic,,,,803952155714850701,,5,,, +HasTopic,,,,1220295546212024391,,18031504,,, +HasTopic,,,,91431002216341149,,73843,,, +Author,437573095319558705,,,447169043921403064,,,,, +HasTopic,,,,1424263331858043042,,2566598,,, +Includes,,,15133734353741126,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,466439,,, +Includes,,,1314315120197156050,393285992310638641,,,,, diff --git a/inputs/wmd/static b/inputs/wmd/static new file mode 100644 index 0000000000..b95a23c9b4 --- /dev/null +++ b/inputs/wmd/static @@ -0,0 +1,650 @@ +#delimieter: , +#columns:type,person1,person2,forum,forum_event,publication,topic,date,lat,lon +#types:STRING,UINT,UINT,UINT,UINT,UINT,UINT,USDATE,DOUBLE,DOUBLE +Publication,,,,,102583151124020340,,4/1/2013,, +Publication,,,,,1004346153600881042,,12/2/2014,, +Publication,,,,,1433303251800176474,,1/1/2014,, +Publication,,,,,963345652072941810,,3/1/2017,, +ForumEvent,,,1372844135435303981,1651365355351122204,,,1/7/2019,, +ForumEvent,,,1372844135435303981,1060309546214304182,,,1/3/2018,, +ForumEvent,,,1372844135435303981,932362105613871012,,,1/8/2018,, +ForumEvent,,,1372844135435303981,618434247743641149,,,1/8/2018,, +ForumEvent,,,1372844135435303981,1209342585680609487,,,1/10/2018,, +ForumEvent,,,1615340315424362057,1245126351375505703,,,2/13/2018,, +ForumEvent,,,1372844135435303981,581543512052485139,,,2/5/2018,, +ForumEvent,,,1314315120197156050,833681012494554358,,,3/23/2018,, +ForumEvent,,,1615340315424362057,1220295546212024391,,,3/26/2018,, +ForumEvent,,,1372844135435303981,1424263331858043042,,,4/5/2018,, +ForumEvent,,,1615340315424362057,1290121451283392110,,,4/12/2018,, +ForumEvent,,,1427292001647224242,240337224527030225,,,4/24/2018,, +ForumEvent,,,1615340315424362057,440265285168056234,,,5/17/2018,, +ForumEvent,,,1615340315424362057,817526874194673140,,,5/31/2018,, +ForumEvent,,,1314315120197156050,846536331643665114,,,6/12/2018,, 
+ForumEvent,,,1202482536733844323,1114502034902546550,,,6/14/2018,, +ForumEvent,,,1372844135435303981,1441762191425652442,,,7/8/2018,, +ForumEvent,,,1615340315424362057,128423416112315798,,,7/20/2018,, +ForumEvent,,,1615340315424362057,701755398615636460,,,8/1/2018,, +ForumEvent,,,1314315120197156050,393285992310638641,,,8/12/2018,, +ForumEvent,,,1615340315424362057,420762134340393550,,,9/9/2018,, +ForumEvent,,,1372844135435303981,737353170652104031,,,9/14/2018,, +ForumEvent,,,1615340315424362057,116892402526543412,,,10/13/2018,, +ForumEvent,,,1372844135435303981,1028329324575034354,,,10/20/2018,, +ForumEvent,,,1202482536733844323,1513662032452523252,,,10/30/2018,, +ForumEvent,,,1314315120197156050,803952155714850701,,,11/14/2018,, +ForumEvent,,,1372844135435303981,186108460103013588,,,11/12/2018,, +ForumEvent,,,1615340315424362057,1184855350262395542,,,12/1/2018,, +ForumEvent,,,1372844135435303981,1302313601603127196,,,12/16/2018,, +ForumEvent,,,1615340315424362057,78678286442461987,,,1/11/2019,, +ForumEvent,,,15133734353741126,1285128710332882742,,,1/10/2019,, +ForumEvent,,,1615340315424362057,447169043921403064,,,2/2/2019,, +ForumEvent,,,1372844135435303981,91431002216341149,,,2/13/2019,, +ForumEvent,,,1202482536733844323,1296829658689065159,,,2/13/2019,, +ForumEvent,,,1615340315424362057,877764733212222524,,,3/28/2019,, +ForumEvent,,,1314315120197156050,1614534111336540475,,,3/3/2019,, +ForumEvent,,,1615340315424362057,209800678458482108,,,4/14/2019,, +ForumEvent,,,15133734353741126,1532662490035322233,,,4/1/2019,, +ForumEvent,,,1314315120197156050,321724159614056152,,,5/29/2019,, +ForumEvent,,,1372844135435303981,1512214307542520410,,,5/17/2019,, +ForumEvent,,,1615340315424362057,740410432146852843,,,6/5/2019,, +ForumEvent,,,1372844135435303981,82629615412640377,,,6/24/2019,, +ForumEvent,,,1427292001647224242,936722743217343702,,,6/30/2019,, +ForumEvent,,,1372844135435303981,747423119260925972,,,7/11/2019,, +ForumEvent,,,451888058015735870,541215404780905313,,,7/3/2019,, +ForumEvent,,,1615340315424362057,1424660009578332566,,,8/25/2019,, +ForumEvent,,,1314315120197156050,1282227710122181132,,,8/5/2019,, +ForumEvent,,,1314315120197156050,854149383334143372,,,9/19/2019,, +ForumEvent,,,1615340315424362057,202421472143651025,,,9/21/2019,, +ForumEvent,,,353365307219544531,956704137555154092,,,10/17/2019,, +ForumEvent,,,,1142353335442842612,,,10/2/2019,, +ForumEvent,,,,1417645062678302203,,,10/27/2019,, +ForumEvent,,,,691612430615344311,,,11/18/2019,, +ForumEvent,,,,499518911125406276,,,11/7/2019,, +ForumEvent,,,,802203574353867462,,,12/26/2019,, +ForumEvent,,,,1154045191214226005,,,12/19/2019,, +Forum,,,227560344059645632,,,,,, +Forum,,,642724485236726353,,,,,, +Forum,,,1583773067440233990,,,,,, +Forum,,,353365307219544531,,,,,, +Forum,,,1372844135435303981,,,,,, +Forum,,,817570614729612563,,,,,, +Forum,,,1222966301068614432,,,,,, +Forum,,,254347350613723281,,,,,, +Forum,,,230406515001545612,,,,,, +Forum,,,1561731546512891652,,,,,, +Forum,,,188043543797416114,,,,,, +Forum,,,1083041743586306041,,,,,, +Forum,,,132472381132383125,,,,,, +Forum,,,20118285562646166,,,,,, +Forum,,,555784630220125214,,,,,, +Forum,,,1015255971523263924,,,,,, +Forum,,,1342495276080758813,,,,,, +Forum,,,851350143155248158,,,,,, +Forum,,,1427292001647224242,,,,,, +Forum,,,722051276937327353,,,,,, +Forum,,,1107212912316309796,,,,,, +Forum,,,504490409499070811,,,,,, +Forum,,,15133734353741126,,,,,, +Forum,,,869745302967338810,,,,,, +Forum,,,324124332757504717,,,,,, +Forum,,,852491638004013222,,,,,, 
+Forum,,,1040437236245414809,,,,,, +Forum,,,442231451428861295,,,,,, +Forum,,,101022092642335391,,,,,, +Forum,,,1037815940207624157,,,,,, +Forum,,,1331941318481662527,,,,,, +Forum,,,1615340315424362057,,,,,, +Forum,,,1425519641234605945,,,,,, +Forum,,,705065952261175094,,,,,, +Forum,,,1314315120197156050,,,,,, +Forum,,,214214821270800149,,,,,, +Forum,,,1361197157264541395,,,,,, +Forum,,,1033538541314217453,,,,,, +Forum,,,565733832133342431,,,,,, +Forum,,,451888058015735870,,,,,, +Forum,,,155345234637251110,,,,,, +Forum,,,1371100161965701220,,,,,, +Forum,,,1307221369082243900,,,,,, +Forum,,,406508153569651122,,,,,, +Forum,,,1202482536733844323,,,,,, +Forum,,,912373284682369433,,,,,, +Person,477384404927196020,,,,,,,, +Person,182010581109145287,,,,,,,, +Topic,,,,,,271997,,, +Topic,,,,,,127197,,, +Person,284405379592161575,,,,,,,, +Topic,,,,,,11650,,, +Topic,,,,,,185785,,, +Topic,,,,,,1907525,,, +Topic,,,,,,1333024,,, +Topic,,,,,,2329,,, +Topic,,,,,,571,,, +Topic,,,,,,56683126,,, +Topic,,,,,,146,,, +Topic,,,,,,487,,, +Topic,,,,,,193294,,, +Topic,,,,,,177,,, +Topic,,,,,,81944,,, +Topic,,,,,,998,,, +Topic,,,,,,55424107,,, +Topic,,,,,,41323,,, +Topic,,,,,,38695,,, +Topic,,,,,,379860,,, +Topic,,,,,,1149078,,, +Topic,,,,,,172809,,, +Topic,,,,,,1642639,,, +Topic,,,,,,903552,,, +Topic,,,,,,204,,, +Topic,,,,,,7817,,, +Topic,,,,,,201816,,, +Topic,,,,,,785,,49.19,-2.11 +Topic,,,,,,127,,, +Topic,,,,,,206021,,, +Topic,,,,,,181508,,, +Topic,,,,,,735,,, +Topic,,,,,,304878,,, +Topic,,,,,,7590,,, +Topic,,,,,,8074,,, +Topic,,,,,,24862,,, +Topic,,,,,,35127,,, +Topic,,,,,,60,,40.67,-73.94 +Topic,,,,,,443533,,, +Person,1160244137181801222,,,,,,,, +Topic,,,,,,192242,,, +Topic,,,,,,11707,,, +Topic,,,,,,73843,,, +Topic,,,,,,505619,,, +Topic,,,,,,158668,,, +Topic,,,,,,889,,34.0,66.0 +Person,895197896920634500,,,,,,,, +Topic,,,,,,18426,,40.84676,-73.873207 +Topic,,,,,,787185,,, +Topic,,,,,,467,,, +Person,1419850416906085161,,,,,,,, +Topic,,,,,,2869238,,, +Topic,,,,,,5,,, +Topic,,,,,,334600,,, +Topic,,,,,,191290,,, +Topic,,,,,,122113,,, +Topic,,,,,,179057,,, +Topic,,,,,,11635,,, +Topic,,,,,,329717,,, +Person,33927662206515912,,,,,,,, +Topic,,,,,,35140,,, +Topic,,,,,,485537,,, +Topic,,,,,,102014,,, +Topic,,,,,,40357,,, +Topic,,,,,,1337691,,, +Topic,,,,,,160409,,40.7825,-73.966111111111 +Topic,,,,,,69871376,,, +Topic,,,,,,177749,,, +Topic,,,,,,11348,,, +Topic,,,,,,182218,,, +Topic,,,,,,1229,,47.568611111111,40.852783333333 +Topic,,,,,,5322,,, +Person,1035098046740791143,,,,,,,, +Topic,,,,,,792565,,48.10277778,20.78388889 +Topic,,,,,,37654,,, +Topic,,,,,,25395,,40.735277777778,-74.185 +Topic,,,,,,169313,,, +Topic,,,,,,728,,, +Topic,,,,,,699385,,, +Topic,,,,,,22983,,, +Person,971383124880710240,,,,,,,, +Person,1010629269012322480,,,,,,,, +Topic,,,,,,11299,,40.728333333333,-73.994166666667 +Topic,,,,,,83460,,, +Topic,,,,,,10289,,, +Topic,,,,,,11019,,, +Topic,,,,,,470118,,, +Person,1426050562563532645,,,,,,,, +Person,75415528634186650,,,,,,,, +Person,1001287904525368324,,,,,,,, +Person,242111862342742257,,,,,,,, +Topic,,,,,,11249,,40.747,-73.986 +Topic,,,,,,3933135,,, +Topic,,,,,,44311,,, +Person,1025135622623992536,,,,,,,, +Topic,,,,,,37497186,,, +Person,584485814982143221,,,,,,,, +Person,1508332501512270227,,,,,,,, +Topic,,,,,,328473,,40.712,-74.002 +Topic,,,,,,25347,,, +Topic,,,,,,175111,,, +Person,1312322776399358210,,,,,,,, +Topic,,,,,,16003594,,, +Topic,,,,,,48789658,,, +Topic,,,,,,8148,,, +Topic,,,,,,9420,,, +Topic,,,,,,771572,,40.699945,-73.950148 +Topic,,,,,,5088838,,, +Person,1597454052092354280,,,,,,,, 
+Person,961135479935321085,,,,,,,, +Topic,,,,,,123705,,, +Topic,,,,,,598435,,, +Topic,,,,,,732934,,, +Person,317248309514344163,,,,,,,, +Person,1524681741257900519,,,,,,,, +Topic,,,,,,254860,,, +Topic,,,,,,335046,,, +Person,534449219561977424,,,,,,,, +Person,1035056342462002945,,,,,,,, +Person,1222330726897222256,,,,,,,, +Person,493345739124130581,,,,,,,, +Topic,,,,,,831691,,, +Topic,,,,,,28321638,,, +Person,682588400093615551,,,,,,,, +Person,920136262355651383,,,,,,,, +Person,351354309273100074,,,,,,,, +Person,495352903902152146,,,,,,,, +Person,211778681592778731,,,,,,,, +Topic,,,,,,1189753,,, +Person,396953035572582107,,,,,,,, +Topic,,,,,,828749,,, +Topic,,,,,,904756,,, +Topic,,,,,,7392008,,, +Topic,,,,,,2566598,,, +Person,363047312690634767,,,,,,,, +Topic,,,,,,618102,,, +Person,205415260510814362,,,,,,,, +Person,1251650482793161774,,,,,,,, +Topic,,,,,,620463,,, +Person,1463522545161373807,,,,,,,, +Person,1150357430325141247,,,,,,,, +Person,674253449444876344,,,,,,,, +Person,1073324208204442390,,,,,,,, +Topic,,,,,,1049632,,40.665352,-73.969264 +Topic,,,,,,29171,,, +Person,1637740339335566412,,,,,,,, +Person,524508243055647325,,,,,,,, +Person,320151361710953715,,,,,,,, +Person,128643504412157535,,,,,,,, +Topic,,,,,,361,,, +Person,1243472362254658420,,,,,,,, +Topic,,,,,,617927,,, +Person,1275555184736572954,,,,,,,, +Topic,,,,,,974850,,, +Person,934144115142885657,,,,,,,, +Topic,,,,,,217627,,, +Topic,,,,,,223155,,, +Person,1504217244688272832,,,,,,,, +Person,144548678565311334,,,,,,,, +Person,1400516284533535554,,,,,,,, +Topic,,,,,,14528,,, +Person,1508951542204233332,,,,,,,, +Person,611325512448133762,,,,,,,, +Person,635555368637193420,,,,,,,, +Person,134403203055015143,,,,,,,, +Topic,,,,,,202013,,, +Topic,,,,,,7602643,,, +Topic,,,,,,121765,,40.774444444444,-73.904166666667 +Person,765254641650259739,,,,,,,, +Person,975526659664533195,,,,,,,, +Person,273872236541568195,,,,,,,, +Topic,,,,,,557887,,, +Topic,,,,,,774228,,, +Topic,,,,,,46744,,, +Person,352033450190732475,,,,,,,, +Person,841466124620556016,,,,,,,, +Person,1517466541524095404,,,,,,,, +Topic,,,,,,114633,,, +Person,301710390995444087,,,,,,,, +Topic,,,,,,16868955,,, +Person,747231730275042400,,,,,,,, +Person,1164902255571715230,,,,,,,, +Person,291914370254601234,,,,,,,, +Topic,,,,,,247154,,, +Topic,,,,,,519,,, +Topic,,,,,,3303945,,, +Person,1022241560051472272,,,,,,,, +Person,566448585007839403,,,,,,,, +Topic,,,,,,11229,,, +Person,735713441679521195,,,,,,,, +Person,1128501731262832684,,,,,,,, +Person,446962590481145702,,,,,,,, +Person,1125113326787431160,,,,,,,, +Person,437201545096608055,,,,,,,, +Person,940377106445268064,,,,,,,, +Person,1647329525841402942,,,,,,,, +Topic,,,,,,3884230,,, +Topic,,,,,,131191,,, +Person,1376053313411407054,,,,,,,, +Person,1347432655942023365,,,,,,,, +Person,1472154222902711100,,,,,,,, +Person,529550602103217450,,,,,,,, +Topic,,,,,,43035,,, +Topic,,,,,,126095,,, +Topic,,,,,,49088,,40.8075,-73.961944444444 +Person,910075513854877065,,,,,,,, +Topic,,,,,,5018694,,40.859105555556,-74.198686111111 +Topic,,,,,,2446683,,43.1189,20.0797 +Topic,,,,,,2030894,,40.850852,-73.844949 +Topic,,,,,,29718382,,, +Topic,,,,,,130965,,40.860833333333,-73.884444444444 +Topic,,,,,,167172,,, +Topic,,,,,,2456507,,, +Person,842652402732741813,,,,,,,, +Topic,,,,,,670897,,33.421111111111,-111.93166666667 +Topic,,,,,,1436668,,, +Topic,,,,,,753651,,, +Topic,,,,,,7451247,,, +Topic,,,,,,2493,,, +Person,719533111062900642,,,,,,,, +Person,834321901190546647,,,,,,,, +Topic,,,,,,12796,,, +Person,937074421253040138,,,,,,,, 
+Topic,,,,,,18159587,,, +Person,101810442957214781,,,,,,,, +Topic,,,,,,131401,,, +Topic,,,,,,929920,,, +Topic,,,,,,466439,,, +Topic,,,,,,6498684,,, +Topic,,,,,,206361,,, +Topic,,,,,,41796,,, +Person,1152266442105786574,,,,,,,, +Person,95240187156237415,,,,,,,, +Topic,,,,,,7897553,,, +Topic,,,,,,206887,,, +Topic,,,,,,5405633,,, +Person,1031526243841315760,,,,,,,, +Topic,,,,,,11348540,,, +Topic,,,,,,4198163,,, +Topic,,,,,,16048728,,, +Topic,,,,,,189756,,, +Topic,,,,,,643638,,, +Topic,,,,,,783874,,, +Topic,,,,,,492346,,37.2708,-76.7069 +Topic,,,,,,29042975,,, +Topic,,,,,,12103677,,, +Topic,,,,,,2329157,,, +Person,1563598527979706128,,,,,,,, +Topic,,,,,,4442,,, +Person,264075025125849069,,,,,,,, +Person,369370063627142227,,,,,,,, +Person,1300183120520109060,,,,,,,, +Topic,,,,,,18031504,,, +Topic,,,,,,4229887,,, +Person,611117914195523184,,,,,,,, +Topic,,,,,,7543639,,, +Topic,,,,,,13977,,, +Topic,,,,,,18122778,,, +Person,166319955306346577,,,,,,,, +Topic,,,,,,588894,,, +Topic,,,,,,2454265,,, +Person,1547400408884914628,,,,,,,, +Person,373641740834326257,,,,,,,, +Topic,,,,,,5264957,,, +Topic,,,,,,968598,,, +Person,754480939973310112,,,,,,,, +Topic,,,,,,1808877,,, +Person,1443919105364146460,,,,,,,, +Person,735243266472522113,,,,,,,, +Person,1321304826561136177,,,,,,,, +Person,1560601202484151215,,,,,,,, +Person,1403521534163206962,,,,,,,, +Person,231472126788137195,,,,,,,, +Person,208411288512434105,,,,,,,, +Topic,,,,,,7252790,,, +Person,1211456636406749825,,,,,,,, +Person,1071303249530347453,,,,,,,, +Person,1069710216181783510,,,,,,,, +Person,1578613817419480731,,,,,,,, +Person,944546653739552042,,,,,,,, +Topic,,,,,,8856932,,, +Person,616673625330310949,,,,,,,, +Person,1302421465423646583,,,,,,,, +Person,720320812100121121,,,,,,,, +Person,653345304799504620,,,,,,,, +Person,346401281431409585,,,,,,,, +Person,1526112405471861415,,,,,,,, +Person,1501623481588541372,,,,,,,, +Person,312380611598980641,,,,,,,, +Person,1115244423173415593,,,,,,,, +Person,1555348115336584230,,,,,,,, +Person,12321118467056216,,,,,,,, +Person,1352636429150180228,,,,,,,, +Person,725324491051434870,,,,,,,, +Person,846764541256336994,,,,,,,, +Person,140443713446471314,,,,,,,, +Person,1135272113235621141,,,,,,,, +Person,775818654043059161,,,,,,,, +Person,529476525413023401,,,,,,,, +Person,1262668194076216011,,,,,,,, +Person,119474435514352445,,,,,,,, +Person,437573095319558705,,,,,,,, +Person,1035555223142154728,,,,,,,, +Person,556320934631523806,,,,,,,, +Person,1356253242219285320,,,,,,,, +Person,248654236829951090,,,,,,,, +Person,481153633235353485,,,,,,,, +Includes,,,1202482536733844323,1296829658689065159,,,,, +HasTopic,,,,956704137555154092,,335046,,, +HasTopic,,,,1028329324575034354,,1808877,,, +HasTopic,,,,,1004346153600881042,735,,, +Author,1560601202484151215,,,1285128710332882742,,,,, +HasTopic,,,,1513662032452523252,,131401,,, +HasTopic,,,,1302313601603127196,,48789658,,, +HasTopic,,,,1114502034902546550,,40357,,, +Sale,1275555184736572954,1463522545161373807,,,,,8/16/2018,, +HasTopic,,,,78678286442461987,,28321638,,, +HasTopic,,,,854149383334143372,,903552,,, +HasTopic,,,,240337224527030225,,519,,, +HasTopic,,,,116892402526543412,,5264957,,, +HasTopic,,,,202421472143651025,,12103677,,, +HasTopic,,,,393285992310638641,,470118,,, +Author,910075513854877065,,,,102583151124020340,,,, +Sale,1426050562563532645,75415528634186650,,,,,7/29/2018,, +Author,1128501731262832684,,,1513662032452523252,,,,, +Sale,971383124880710240,1010629269012322480,,,,38695,7/28/2018,, +Author,477384404927196020,,,1651365355351122204,,,,, 
+Author,725324491051434870,,,202421472143651025,,,,, +HasTopic,,,,932362105613871012,,69871376,,, +HasTopic,,,,1290121451283392110,,169313,,, +Sale,396953035572582107,1400516284533535554,,,,41323,8/21/2018,, +HasTopic,,,,701755398615636460,,8148,,, +Sale,940377106445268064,1647329525841402942,,,,25347,10/15/2018,, +HasTopic,,,,1651365355351122204,,643638,,, +HasTopic,,,,1114502034902546550,,46744,,, +HasTopic,,,,936722743217343702,,123705,,, +HasTopic,,,,321724159614056152,,6498684,,, +Sale,1419850416906085161,1128501731262832684,,,,2869238,09/28/2018,, +HasTopic,,,1372844135435303981,,,60,,, +Author,1578613817419480731,,,321724159614056152,,,,, +HasTopic,,,,1512214307542520410,,8074,,, +HasTopic,,,,618434247743641149,,192242,,, +Includes,,,1202482536733844323,1114502034902546550,,,,, +Includes,,,1615340315424362057,877764733212222524,,,,, +HasTopic,,,,240337224527030225,,1229,,, +HasTopic,,,,1209342585680609487,,179057,,, +Author,834321901190546647,,,,1433303251800176474,,,, +HasTopic,,,,1209342585680609487,,175111,,, +Includes,,,1372844135435303981,581543512052485139,,,,, +Author,720320812100121121,,,420762134340393550,,,,, +Includes,,,1615340315424362057,1424660009578332566,,,,, +HasTopic,,,,932362105613871012,,771572,,, +Includes,,,353365307219544531,956704137555154092,,,,, +HasTopic,,,,956704137555154092,,929920,,, +HasTopic,,,,1441762191425652442,,177749,,, +Author,1563598527979706128,,,1220295546212024391,,,,, +Author,248654236829951090,,,1424660009578332566,,,,, +Author,529476525413023401,,,1302313601603127196,,,,, +HasTopic,,,,1209342585680609487,,771572,,, +HasTopic,,,,,1433303251800176474,83460,,, +Sale,1022241560051472272,1637740339335566412,,,,1642639,10/13/2018,, +Includes,,,1615340315424362057,128423416112315798,,,,, +Includes,,,1372844135435303981,932362105613871012,,,,, +Includes,,,1372844135435303981,747423119260925972,,,,, +HasTopic,,,,854149383334143372,,24862,,, +Sale,1160244137181801222,1035056342462002945,,,,181508,10/4/2018,, +HasTopic,,,,1114502034902546550,,328473,,, +Includes,,,1314315120197156050,854149383334143372,,,,, +Sale,1472154222902711100,1128501731262832684,,,,185785,09/28/2018,, +HasOrg,,,,,102583151124020340,49088,,, +HasTopic,,,,932362105613871012,,11299,,, +HasTopic,,,,932362105613871012,,18426,,, +HasTopic,,,,,1004346153600881042,7392008,,, +Author,1526112405471861415,,,846536331643665114,,,,, +HasTopic,,,,78678286442461987,,617927,,, +HasTopic,,,,321724159614056152,,4229887,,, +HasTopic,,,,82629615412640377,,5322,,, +Author,1128501731262832684,,,1114502034902546550,,,,, +Author,166319955306346577,,,209800678458482108,,,,, +Sale,735713441679521195,1128501731262832684,,,,11650,10/10/2018,, +Author,944546653739552042,,,803952155714850701,,,,, +HasTopic,,,,581543512052485139,,3933135,,, +HasTopic,,,,209800678458482108,,4198163,,, +HasTopic,,,,932362105613871012,,7590,,, +Includes,,,1372844135435303981,1060309546214304182,,,,, +HasTopic,,,,846536331643665114,,167172,,, +HasTopic,,,,740410432146852843,,11348540,,, +Author,937074421253040138,,,,1004346153600881042,,,, +HasTopic,,,,1282227710122181132,,2493,,, +HasTopic,,,,,1433303251800176474,43035,,, +Sale,495352903902152146,211778681592778731,,,,81944,8/6/2018,, +Includes,,,1372844135435303981,1512214307542520410,,,,, +HasTopic,,,353365307219544531,,,2329,,, +Includes,,,1615340315424362057,701755398615636460,,,,, +Includes,,,1372844135435303981,1209342585680609487,,,,, +Includes,,,1202482536733844323,1513662032452523252,,,,, +HasTopic,,,,701755398615636460,,968598,,, 
+HasTopic,,,,128423416112315798,,11019,,, +Author,1356253242219285320,,,854149383334143372,,,,, +Author,1031526243841315760,,,,1433303251800176474,,,, +Author,842652402732741813,,,,1433303251800176474,,,, +HasTopic,,,,440265285168056234,,18159587,,, +Author,1501623481588541372,,,833681012494554358,,,,, +HasTopic,,,,1028329324575034354,,11299,,, +Author,140443713446471314,,,1028329324575034354,,,,, +Author,264075025125849069,,,1245126351375505703,,,,, +Includes,,,1314315120197156050,1614534111336540475,,,,, +HasTopic,,,,,1433303251800176474,131191,,, +HasTopic,,,,420762134340393550,,1907525,,, +Sale,975526659664533195,524508243055647325,,,,,10/15/2018,, +Author,284405379592161575,,,,102583151124020340,,,, +HasTopic,,,,,963345652072941810,735,,, +HasTopic,,,,,1433303251800176474,3303945,,, +Author,1152266442105786574,,,,963345652072941810,,,, +HasTopic,,,,202421472143651025,,1189753,,, +HasTopic,,,,1028329324575034354,,11229,,, +Includes,,,1615340315424362057,1220295546212024391,,,,, +Includes,,,1314315120197156050,846536331643665114,,,,, +HasTopic,,,,1296829658689065159,,7451247,,, +Includes,,,1615340315424362057,78678286442461987,,,,, +HasTopic,,,,541215404780905313,,11635,,, +Author,481153633235353485,,,956704137555154092,,,,, +Author,611117914195523184,,,701755398615636460,,,,, +Author,95240187156237415,,,,963345652072941810,,,, +Includes,,,1372844135435303981,91431002216341149,,,,, +Author,846764541256336994,,,1290121451283392110,,,,, +HasTopic,,,,393285992310638641,,206887,,, +HasTopic,,,,1512214307542520410,,177749,,, +HasTopic,,,,1114502034902546550,,7252790,,, +HasTopic,,,1202482536733844323,,,60,,, +HasTopic,,,,1302313601603127196,,41796,,, +HasTopic,,,,1114502034902546550,,44311,,, +Author,1547400408884914628,,,393285992310638641,,,,, +Author,312380611598980641,,,78678286442461987,,,,, +Sale,273872236541568195,1251650482793161774,,,,172809,8/22/2018,, +HasTopic,,,,1513662032452523252,,728,,, +Includes,,,1615340315424362057,447169043921403064,,,,, +Includes,,,1372844135435303981,82629615412640377,,,,, +HasTopic,,,,1302313601603127196,,29042975,,, +Author,1321304826561136177,,,1532662490035322233,,,,, +Includes,,,1372844135435303981,1302313601603127196,,,,, +Sale,363047312690634767,242111862342742257,,,,,10/4/2018,, +Author,1071303249530347453,,,737353170652104031,,,,, +HasTopic,,,,1282227710122181132,,35140,,, +HasTopic,,,,91431002216341149,,46744,,, +Includes,,,1372844135435303981,1441762191425652442,,,,, +Sale,446962590481145702,534449219561977424,,,,,10/11/2018,, +Author,1035555223142154728,,,877764733212222524,,,,, +Author,1403521534163206962,,,932362105613871012,,,,, +HasTopic,,,,701755398615636460,,35127,,, +Includes,,,1372844135435303981,1028329324575034354,,,,, +HasTopic,,,,1245126351375505703,,254860,,, +HasTopic,,,,209800678458482108,,7897553,,, +Includes,,,1372844135435303981,186108460103013588,,,,, +HasOrg,,,,,102583151124020340,304878,,, +HasTopic,,,,,1433303251800176474,998,,, +Author,653345304799504620,,,581543512052485139,,,,, +Author,1302421465423646583,,,240337224527030225,,,,, +Author,1211456636406749825,,,618434247743641149,,,,, +HasTopic,,,,240337224527030225,,785,,, +HasTopic,,,,,1004346153600881042,83460,,, +Includes,,,1615340315424362057,1245126351375505703,,,,, +HasTopic,,,,747423119260925972,,16868955,,, +HasTopic,,,,91431002216341149,,9420,,, +Includes,,,451888058015735870,541215404780905313,,,,, +HasTopic,,,,1209342585680609487,,492346,,, +HasTopic,,,,1290121451283392110,,114633,,, +HasTopic,,,,1441762191425652442,,11249,,, 
+HasTopic,,,,1532662490035322233,,753651,,, +HasTopic,,,,1532662490035322233,,11707,,, +HasTopic,,,,1245126351375505703,,904756,,, +HasTopic,,,15133734353741126,,,189756,,, +Sale,1222330726897222256,493345739124130581,,,,177,8/4/2018,, +Includes,,,1372844135435303981,1424263331858043042,,,,, +Includes,,,1615340315424362057,1290121451283392110,,,,, +Sale,477384404927196020,182010581109145287,,,,271997,2/17/2019,, +HasTopic,,,,737353170652104031,,16003594,,, +HasTopic,,,,747423119260925972,,2329157,,, +Sale,566448585007839403,765254641650259739,,,,,10/9/2018,, +HasOrg,,,,,963345652072941810,130965,,, +Author,616673625330310949,,,936722743217343702,,,,, +HasTopic,,,,1513662032452523252,,44311,,, +Author,119474435514352445,,,1282227710122181132,,,,, +HasTopic,,,,82629615412640377,,22983,,, +HasTopic,,,,833681012494554358,,201816,,, +HasTopic,,,,,1433303251800176474,14528,,, +Sale,33927662206515912,934144115142885657,,,,,10/10/2018,, +Includes,,,1372844135435303981,618434247743641149,,,,, +Includes,,,1615340315424362057,740410432146852843,,,,, +Author,1300183120520109060,,,128423416112315798,,,,, +Author,1069710216181783510,,,1296829658689065159,,,,, +Sale,1504217244688272832,144548678565311334,,,,56683126,8/13/2018,, +Author,1115244423173415593,,,1614534111336540475,,,,, +Sale,747231730275042400,584485814982143221,,,,127,10/7/2018,, +HasTopic,,,,1285128710332882742,,37654,,, +HasTopic,,,1615340315424362057,,,12796,,, +Author,1555348115336584230,,,817526874194673140,,,,, +HasTopic,,,,,102583151124020340,43035,,, +HasTopic,,,,737353170652104031,,206361,,, +Includes,,,1314315120197156050,833681012494554358,,,,, +Sale,128643504412157535,320151361710953715,,,,443533,8/24/2018,, +HasTopic,,,,1220295546212024391,,588894,,, +HasTopic,,,,202421472143651025,,7602643,,, +Sale,1150357430325141247,674253449444876344,,,,,10/7/2018,, +Author,735243266472522113,,,1209342585680609487,,,,, +Includes,,,1372844135435303981,1651365355351122204,,,,, +HasTopic,,,,1424660009578332566,,618102,,, +HasTopic,,,,,102583151124020340,126095,,, +HasTopic,,,,,1004346153600881042,2446683,,, +HasTopic,,,,,1433303251800176474,598435,,, +Author,735713441679521195,,,,1433303251800176474,,,, +Includes,,,1314315120197156050,321724159614056152,,,,, +HasTopic,,,,1114502034902546550,,8856932,,, +HasTopic,,,,,1433303251800176474,5405633,,, +HasTopic,,,,1285128710332882742,,11299,,, +HasTopic,,,,420762134340393550,,12796,,, +HasTopic,,,,1184855350262395542,,329717,,, +HasTopic,,,,116892402526543412,,2456507,,, +Includes,,,1615340315424362057,817526874194673140,,,,, +Sale,841466124620556016,1517466541524095404,,,,,8/27/2018,, +HasTopic,,,,1209342585680609487,,127197,,, +HasTopic,,,,420762134340393550,,16048728,,, +HasTopic,,,,209800678458482108,,774228,,, +Author,231472126788137195,,,82629615412640377,,,,, +HasTopic,,,,,1433303251800176474,7817,,, +HasTopic,,,,932362105613871012,,1049632,,, +Sale,1164902255571715230,291914370254601234,,,,571,10/8/2018,, +HasTopic,,,,,1433303251800176474,467,,, +HasTopic,,,,1220295546212024391,,37497186,,, +HasTopic,,,,817526874194673140,,732934,,, +Includes,,,1615340315424362057,202421472143651025,,,,, +HasTopic,,,,1513662032452523252,,889,,, +Includes,,,1372844135435303981,737353170652104031,,,,, +HasTopic,,,,1296829658689065159,,83460,,, +HasTopic,,,,747423119260925972,,1333024,,, +HasTopic,,,,1290121451283392110,,204,,, +Author,611117914195523184,,,440265285168056234,,,,, +Author,754480939973310112,,,1441762191425652442,,,,, +Includes,,,1427292001647224242,936722743217343702,,,,, diff --git 
a/libcusp/CMakeLists.txt b/libcusp/CMakeLists.txt index 2cc6e1714d..67b603019e 100644 --- a/libcusp/CMakeLists.txt +++ b/libcusp/CMakeLists.txt @@ -27,3 +27,5 @@ install(TARGETS galois_cusp COMPONENT lib INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) + +add_subdirectory(test) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 22fdf63d10..515f957e54 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -42,6 +42,9 @@ class PartitioningScaffold { uint64_t _numEdges; //!< number of edges in graph //! maps from host id to nodes that host as read from disk std::vector> _gid2host; + std::vector + _virtualToPhyMapping; // saving Virtual hosts to Phy hosts map + bool hash; // switch between using gid2host and VtoP maps public: /** @@ -64,7 +67,14 @@ class PartitioningScaffold { */ void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; + hash = false; } + void saveGIDToHost(std::vector& virtualToPhyMapping) { + _virtualToPhyMapping = virtualToPhyMapping; + hash = true; + } + + bool predeterminedMapping(std::vector&) { return false; } }; /** @@ -88,15 +98,19 @@ class ReadMasterAssignment : public PartitioningScaffold { * @returns Host ID of host that read the node specified by the GID. */ uint32_t retrieveMaster(uint32_t gid) const { - for (auto h = 0U; h < _numHosts; ++h) { - uint64_t start, end; - std::tie(start, end) = _gid2host[h]; - if (gid >= start && gid < end) { - return h; + if (hash == false) { + for (auto h = 0U; h < _numHosts; ++h) { + uint64_t start, end; + std::tie(start, end) = _gid2host[h]; + if (gid >= start && gid < end) { + return h; + } } + assert(false); + return _numHosts; + } else { + return _virtualToPhyMapping[gid % (_virtualToPhyMapping.size())]; } - assert(false); - return _numHosts; } // below all unused if not assigning masters in default manner, but must be @@ -149,8 +163,13 @@ class CustomMasterAssignment : public PartitioningScaffold { char _status; //!< Specifies what phase of master assignment partitioner is on //! Metadata for determining where a node's master is std::vector _localNodeToMaster; - //! Map GID to its master + //! Map GID to its master; only for nodes we own std::unordered_map _gid2masters; + //! Unlike gid2masters, this contains a mapping in vector form of ALL mappings + //! for all nodes in the graph instead of just local ones; only used if it is + //! known exactly where everything ends up before partitioning + std::vector _globalHostMap; + //! This host's node offset (each host reads a distinct contiguous portion //! of graph uint64_t _nodeOffset; @@ -183,6 +202,8 @@ class CustomMasterAssignment : public PartitioningScaffold { * mapping is not found but instead returns -1 if in stage 1, else * fails. 
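[Editor's note, not part of the patch] The BasePolicies.h hunk above adds a hashed master lookup: once a virtual-to-physical host table is saved via saveGIDToHost, ReadMasterAssignment::retrieveMaster resolves a global node ID as virtualToPhyMapping[gid % V] instead of scanning the gid2host ranges. A minimal sketch of that arithmetic, with illustrative names only:

#include <cassert>
#include <cstdint>
#include <vector>

// Hashed master lookup: gid -> virtual host (gid % V) -> physical host.
uint32_t hashedMaster(uint64_t gid, const std::vector<uint32_t>& virtualToPhy) {
  assert(!virtualToPhy.empty());
  return virtualToPhy[gid % virtualToPhy.size()];
}

// Example: virtualToPhy = {0, 1, 0, 1} spreads 4 virtual hosts over 2 physical
// hosts; gid 6 lands on virtual host 2 and is therefore mastered by host 0.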
* + * ONLY WORKS IF GID IS ON LOCAL HOST ELSE WILL FAIL + * * @param gid GID to get master of * @returns Master of specified GID, -1, unsigned, if not found */ @@ -194,19 +215,17 @@ class CustomMasterAssignment : public PartitioningScaffold { // found in map if (gidMasterIter != _gid2masters.end()) { uint32_t mappedMaster = gidMasterIter->second; - // galois::gDebug("[", _hostID, "] ", gid, " found with master ", - // mappedMaster, "!"); // make sure host is in bounds assert(mappedMaster < _numHosts); return mappedMaster; } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, " not found!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " - "of master assignment phase"); + "of master assignment phase; that or passed in gid that" + " doesn't exist on this host"); } return (uint32_t)-1; } @@ -242,7 +261,6 @@ class CustomMasterAssignment : public PartitioningScaffold { for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) { assert(i->second < localNodeToMaster.size()); - galois::gDebug("Map ", i->first, " to ", localNodeToMaster[i->second]); _gid2masters[i->first] = localNodeToMaster[i->second]; } assert(_gid2masters.size() == (originalSize + gid2offsets.size())); @@ -303,13 +321,10 @@ class CustomMasterAssignment : public PartitioningScaffold { auto offsetIntoMapIter = _gid2masters.find(gid); if (offsetIntoMapIter == _gid2masters.end()) { // NOT FOUND - galois::gDebug("[", _hostID, "] ", gid, " not found; mapping!"); _gid2masters[gid] = mappedMaster; return true; } else { // already mapped - galois::gDebug("[", _hostID, "] ", gid, " already mapped with master ", - offsetIntoMapIter->second, "!"); assert(offsetIntoMapIter->second == mappedMaster); return false; } diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6df9707a27..5541be426d 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,6 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in + * @param useWMD "true" if the passed graph file format is a WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -83,7 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useWMD = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -126,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, useTranspose, - readPolicy, nodeWeight, edgeWeight, masterBlockFile); + inputToUse, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, + useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, 
false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 1720081e77..540b25e120 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -31,6 +31,7 @@ #include #include "galois/graphs/LC_CSR_Graph.h" +#include "galois/graphs/LC_CSR_CSC_Graph.h" #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -60,13 +61,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = + galois::graphs::LC_CSR_CSC_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -256,14 +260,14 @@ class DistGraph { for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); } - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -326,14 +330,210 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + + /** + * Given the number of global nodes, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param numGlobalNodes The number of global nodes to divide + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBlockedNodes(uint64_t numGlobalNodes, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = numGlobalNodes; + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. 
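[Editor's note, not part of the patch] The CuSPPartitioner.h hunk above threads a new useWMD flag through cuspPartitionGraph so callers can flag a WMD-format CSV input. A hedged call-site sketch follows; Policy, NodeData, and EdgeData are placeholders, and the exact template parameters are elided in this rendering of the hunk:

#include <string>
#include "galois/graphs/CuSPPartitioner.h"

// Sketch only: forwards to the patched cuspPartitionGraph overload, assuming
// the application supplies its own partition policy and node/edge data types.
template <typename Policy, typename NodeData, typename EdgeData>
auto loadWMDGraph(const std::string& file) {
  // New fourth argument: useWMD = true tells CuSP the input is a WMD CSV.
  return galois::cuspPartitionGraph<Policy, NodeData, EdgeData>(
      file, galois::CUSP_CSR, galois::CUSP_CSR,
      /*useWMD=*/true, /*symmetricGraph=*/false);
}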
+ assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBalancedEdges(uint64_t numGlobalNodes, + uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + // TODO(hc): + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, outIndices, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + // TODO(hc): + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == numGlobalNodes); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by evenly + * (or unevenly as specified by scale factor) + 
* blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. + * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint64_t* outIndices, + const std::vector& scalefactor, uint32_t nodeWeight, + uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = numGlobalEdges / numGlobalNodes; // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, id, numHosts, + outIndices, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -390,10 +590,68 @@ class DistGraph { galois::runtime::reportStatCond_Tmax( GRNAME, "MasterDistTime", timer.get()); - galois::gPrint( + galois::gDebug( "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), - " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); + return numNodes_to_divide; + } + + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + uint64_t numNodes_to_divide = numGlobalNodes; + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes(numGlobalNodes, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges(numGlobalNodes, numGlobalEdges, + outIndices, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges(numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + +#if 0 + galois::gDebug( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); +#endif return numNodes_to_divide; } @@ -443,14 +701,14 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -539,6 +797,9 @@ class DistGraph { public: virtual ~DistGraph() {} + + unsigned GetLIDHost(uint64_t lid) const { return getHostIDImpl(getGID(lid)); } + //! Determines which host has the master for a particular node //! @returns Host id of node in question inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } @@ -742,6 +1003,51 @@ class DistGraph { return IDs; } + ////////////////////////////////////////////////////////////////////////////// + // for in edges + ////////////////////////////////////////////////////////////////////////////// + + //! Construct the transpose graph for the partitioned graph + void ConstructIncomingEdges() { graph.constructIncomingEdges(); } + + /** + * Get the edge data for a particular edge in the graph. + * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + typename GraphTy::edge_data_reference + GetInEdgeData(edge_iterator ni, + galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { + return graph.getInEdgeData(ni, mflag); + } + + GraphNode GetInEdgeDest(edge_iterator ni) { return graph.getInEdgeDst(ni); } + + edge_iterator in_edge_begin(GraphNode N) { + return graph.in_edge_begin(N, galois::MethodFlag::UNPROTECTED); + } + + edge_iterator in_edge_end(GraphNode N) { + return graph.in_edge_end(N, galois::MethodFlag::UNPROTECTED); + } + + galois::runtime::iterable> + in_edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(in_edge_begin(N), + in_edge_end(N)); + } + + //! 
Return corresponding out-edge index for an in-edge + size_t InEdgeToOutEdge(edge_iterator ni) const { + return graph.InEdgeToOutEdge(ni); + } + + ////////////////////////////////////////////////////////////////////////////// + // end in edges + ////////////////////////////////////////////////////////////////////////////// + protected: /** * Uses a pre-computed prefix sum to determine division of nodes among @@ -773,7 +1079,6 @@ class DistGraph { withEdgeRanges.size() != 0) { masterRanges = withEdgeRanges; } else { - galois::gDebug("Manually det. master thread ranges"); masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, beginMaster + numOwned, 0); @@ -799,7 +1104,6 @@ class DistGraph { masterRanges.size() != 0) { withEdgeRanges = masterRanges; } else { - galois::gDebug("Manually det. with edges thread ranges"); withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); } @@ -865,10 +1169,7 @@ class DistGraph { /** * Deallocates underlying LC CSR Graph */ - void deallocate() { - galois::gDebug("Deallocating CSR in DistGraph"); - graph.deallocate(); - } + void deallocate() { graph.deallocate(); } /** * Sort the underlying LC_CSR_Graph by ID (destinations) @@ -881,10 +1182,22 @@ class DistGraph { [&](GN n) { graph.sortEdges(n, IdLess()); }, galois::no_stats(), galois::loopname("CSREdgeSort"), galois::steal()); } + + //////////////////////////////////////////////////////////////////////////////// + // what follows are GNN functions; some are not great (e.g. expose arrays) + // TODO figure out better way to do this + //////////////////////////////////////////////////////////////////////////////// + EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } + NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } + + //! Used by substrate to determine if some stats are to be reported + bool is_a_graph() const { return true; } }; -template -constexpr const char* const galois::graphs::DistGraph::GRNAME; +template +constexpr const char* const + galois::graphs::DistGraph::GRNAME; } // end namespace graphs } // end namespace galois diff --git a/libcusp/include/galois/graphs/DistributedLocalGraph.h b/libcusp/include/galois/graphs/DistributedLocalGraph.h new file mode 100644 index 0000000000..2920614232 --- /dev/null +++ b/libcusp/include/galois/graphs/DistributedLocalGraph.h @@ -0,0 +1,1063 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. 
Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedLocalGraph.h + * + * Contains the implementation for DistLocalGraph. Command line argument + * definitions are found in DistributedGraph.cpp. + */ + +#ifndef _GALOIS_DISTRIBUTED_LOCAL_GRAPH_H +#define _GALOIS_DISTRIBUTED_LOCAL_GRAPH_H + +#include +#include + +#include "galois/graphs/DistributedGraph.h" +#include "galois/graphs/LS_LC_CSR_Graph.h" +#include "galois/graphs/BufferedGraph.h" +#include "galois/runtime/DistStats.h" +#include "galois/graphs/OfflineGraph.h" +#include "galois/DynamicBitset.h" + +/* + * Headers for boost serialization + */ + +namespace galois { +namespace graphs { + +/** + * Base DistLocalGraph class that all distributed graphs extend from. + * + * @tparam NodeTy type of node data for the graph + * @tparam EdgeTy type of edge data for the graph + */ +template +class DistLocalGraph { +private: + //! Graph name used for printing things + constexpr static const char* const GRNAME = "dGraph"; + + using GraphTy = galois::graphs::LS_LC_CSR_Graph; + + // vector for determining range objects for master nodes + nodes + // with edges (which includes masters) + //! represents split of all nodes among threads to balance edges + std::vector allNodesRanges; + //! represents split of master nodes among threads to balance edges + std::vector masterRanges; + //! represents split of nodes with edges (includes masters) among threads to + //! balance edges + std::vector withEdgeRanges; + //! represents split of all nodes among threads to balance in-edges + std::vector allNodesRangesIn; + //! represents split of master nodes among threads to balance in-edges + std::vector masterRangesIn; + + using NodeRangeType = + galois::runtime::SpecificRange>; + + //! Vector of ranges that stores the 3 different range objects that a user is + //! able to access + std::vector specificRanges; + //! Like specificRanges, but for in edges + std::vector specificRangesIn; + +protected: + //! The internal graph used by DistLocalGraph to represent the graph + GraphTy* graph; + + //! Marks if the graph is transposed or not. + bool transposed; + + // global graph variables + uint64_t numGlobalNodes; //!< Total nodes in the global unpartitioned graph. + uint64_t numGlobalEdges; //!< Total edges in the global unpartitioned graph. + uint32_t numNodes; //!< Num nodes in this graph in total + uint64_t numEdges; //!< Num edges in this graph in total + + const unsigned id; //!< ID of the machine. + const uint32_t numHosts; //!< Total number of machines + + // local graph + // size() = Number of nodes created on this host (masters + mirrors) + uint32_t numOwned; //!< Number of nodes owned (masters) by this host. + //!< size() - numOwned = mirrors on this host + uint32_t beginMaster; //!< Local id of the beginning of master nodes. + //!< beginMaster + numOwned = local id of the end of + //!< master nodes + uint32_t numNodesWithEdges; //!< Number of nodes (masters + mirrors) that have + //!< outgoing edges + + //! Information that converts host to range of nodes that host reads + std::vector> gid2host; + //! Mirror nodes from different hosts. For reduce + std::vector> mirrorNodes; + + //! 
GID = localToGlobalVector[LID] + std::vector localToGlobalVector; + //! LID = globalToLocalMap[GID] + std::unordered_map globalToLocalMap; + + //! Increments evilPhase, a phase counter used by communication. + void inline increment_evilPhase() { + ++galois::runtime::evilPhase; + if (galois::runtime::evilPhase >= + static_cast( + std::numeric_limits::max())) { // limit defined by MPI or + // LCI + galois::runtime::evilPhase = 1; + } + } + + //! Returns evilPhase + 1, handling loop around as necessary + unsigned inline evilPhasePlus1() { + unsigned result = galois::runtime::evilPhase + 1; + + // limit defined by MPI or LCI + if (result >= uint32_t{std::numeric_limits::max()}) { + return 1; + } + return result; + } + + //! used to sort edges in the sort edges function + template + struct IdLess { + bool + operator()(const galois::graphs::EdgeSortValue& e1, + const galois::graphs::EdgeSortValue& e2) const { + return e1.dst < e2.dst; + } + }; + +private: + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + void computeMastersBlockedNodes(galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = g.size(); + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. + assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
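+ *
+ * Sketch of the outcome: each of the numHosts * DecomposeFactor blocks
+ * receives a contiguous global-ID range in gid2host whose edge count
+ * (scaled by edgeWeight) is roughly the same, so edge-heavy regions of
+ * the graph get spread across hosts rather than piled onto one.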
+ */ + void computeMastersBalancedEdges(galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + auto r = g.divideByNode(0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == g.size()); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given an OfflineGraph, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. 
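+ *
+ * Balance criterion (a sketch of the intent, not the exact search):
+ * each host's contiguous range is chosen so that
+ *   nodeWeight * (#nodes) + edgeWeight * (#edges)
+ * is approximately equal across hosts; a nodeWeight of 0 is replaced
+ * by the average degree before dividing.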
+ * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + galois::graphs::OfflineGraph& g, const std::vector& scalefactor, + uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = g.sizeEdges() / g.size(); // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = g.divideByNode(nodeWeight, edgeWeight, id, numHosts, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + +protected: + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param g The offline graph which has loaded the graph you want + * to get the masters for + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. + */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + galois::graphs::OfflineGraph& g, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + g.reset_seek_counters(); + + uint64_t numNodes_to_divide = g.size(); + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes(g, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges(g, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges(g, scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + + galois::gPrint( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + return numNodes_to_divide; + } + + //! reader assignment from a file + //! 
corresponds to master assignment if using an edge cut + void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { + // read file lines + std::ifstream mappings(filename); + std::string curLine; + + unsigned timesToRead = id + 1; + + for (unsigned i = 0; i < timesToRead; i++) { + std::getline(mappings, curLine); + } + + std::vector modifyLine(curLine.begin(), curLine.end()); + char* tokenizedString = modifyLine.data(); + char* token; + token = strtok(tokenizedString, " "); + + // loop 6 more times + for (unsigned i = 0; i < 6; i++) { + token = strtok(NULL, " "); + } + std::string left(token); + + // 3 more times for right + for (unsigned i = 0; i < 3; i++) { + token = strtok(NULL, " "); + } + std::string right(token); + + gid2host.resize(numHosts); + gid2host[id].first = std::stoul(left); + gid2host[id].second = std::stoul(right) + 1; + galois::gPrint("[", id, "] Left: ", gid2host[id].first, + ", Right: ", gid2host[id].second, "\n"); + + ///////////////////////// + // send/recv from other hosts + ///////////////////////// + auto& net = galois::runtime::getSystemNetworkInterface(); + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + + // sanity checking assignment + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + GALOIS_ASSERT(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, + gid2host[h].first, " ", gid2host[h - 1].second); + GALOIS_ASSERT(gid2host[h].second == g.size(), gid2host[h].second, " ", + g.size()); + } else { + GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second, + gid2host[h].first, " ", gid2host[h - 1].second); + GALOIS_ASSERT(gid2host[h].second == gid2host[h + 1].first, + gid2host[h].second, " ", gid2host[h + 1].first); + } + } + } + + uint32_t G2L(uint64_t gid) const { + assert(isLocal(gid)); + return globalToLocalMap.at(gid); + } + + uint64_t L2G(uint32_t lid) const { return localToGlobalVector[lid]; } + +public: + //! Type representing a node in this graph + using GraphNode = typename GraphTy::VertexTopologyID; + //! Type representing an edge data in this graph + using EdgeType = EdgeTy; + //! iterator type over edges + using edge_iterator = typename GraphTy::EdgeIterator; + + /** + * Constructor for DistLocalGraph. Initializes metadata fields. + * + * @param host host number that this graph resides on + * @param numHosts total number of hosts in the currently executing program + */ + DistLocalGraph(unsigned host, unsigned numHosts) + : transposed(false), id(host), numHosts(numHosts) { + mirrorNodes.resize(numHosts); + numGlobalNodes = 0; + numGlobalEdges = 0; + } + + /** + * Return a vector of pairs denoting mirror node ranges. + * + * Assumes all mirror nodes occur after the masters: this invariant should be + * held by CuSP. 
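+ *
+ * Example (hypothetical counts): with numOwned == 100 and
+ * numNodes == 130 the returned vector is {(100, 130)}; if every local
+ * node is a master the vector is empty.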
+ */ + std::vector> getMirrorRanges() const { + std::vector> mirrorRangesVector; + // order of nodes locally is masters, outgoing mirrors, incoming mirrors, + // so just get from numOwned to end + if (numOwned != numNodes) { + assert(numOwned < numNodes); + mirrorRangesVector.push_back(std::make_pair(numOwned, numNodes)); + } + return mirrorRangesVector; + } + + std::vector>& getMirrorNodes() { return mirrorNodes; } + +private: + virtual unsigned getHostIDImpl(uint64_t) const = 0; + virtual bool isOwnedImpl(uint64_t) const = 0; + virtual bool isLocalImpl(uint64_t) const = 0; + virtual bool isVertexCutImpl() const = 0; + virtual std::pair cartesianGridImpl() const { + return std::make_pair(0u, 0u); + } + +public: + virtual ~DistLocalGraph() {} + void initGraph(uint64_t numNodes) { graph = new GraphTy(numNodes); } + //! Determines which host has the master for a particular node + //! @returns Host id of node in question + inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } + //! Determine if a node has a master on this host. + //! @returns True if passed in global id has a master on this host + inline bool isOwned(uint64_t gid) const { return isOwnedImpl(gid); } + //! Determine if a node has a proxy on this host + //! @returns True if passed in global id has a proxy on this host + inline bool isLocal(uint64_t gid) const { return isLocalImpl(gid); } + /** + * Returns true if current partition is a vertex cut + * @returns true if partition being stored in this graph is a vertex cut + */ + inline bool is_vertex_cut() const { return isVertexCutImpl(); } + /** + * Returns Cartesian split (if it exists, else returns pair of 0s + */ + inline std::pair cartesianGrid() const { + return cartesianGridImpl(); + } + + bool isTransposed() { return transposed; } + + /** + * Converts a local node id into a global node id + * + * @param nodeID local node id + * @returns global node id corresponding to the local one + */ + inline uint64_t getGID(const uint32_t nodeID) const { return L2G(nodeID); } + + /** + * Converts a global node id into a local node id + * + * @param nodeID global node id + * @returns local node id corresponding to the global one + */ + inline uint32_t getLID(const uint64_t nodeID) const { return G2L(nodeID); } + + /** + * Get data of a node. + * + * @param N node to get the data of + * @param mflag access flag for node data + * @returns A node data object + */ + inline NodeTy& getData(GraphNode N) { + auto& r = graph->getData(N); + return r; + } + + /** + * Get the edge data for a particular edge in the graph. + * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + inline EdgeTy& getEdgeData(GraphNode src, edge_iterator ni) { + GraphNode dst = getEdgeDst(ni); + auto& r = graph->getEdgeData(std::make_pair(src, getGID(dst))); + return r; + } + + inline EdgeTy& getEdgeData(edge_iterator ni) { + auto& r = graph->getEdgeData(*ni); + return r; + } + + /** + * Gets edge destination of edge ni. + * + * @param ni edge id to get destination of + * @returns Local ID of destination of edge ni + */ + GraphNode getEdgeDst(edge_iterator ni) { + return getGID(graph->getEdgeDst(*ni)); + } + + /** + * Gets the first edge of some node. + * + * @param N node to get the edge of + * @returns iterator to first edge of N + */ + inline edge_iterator edge_begin(GraphNode N) { + return graph->edges(N).begin(); + } + + /** + * Gets the end edge boundary of some node. 
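+ *
+ * Typical traversal sketch using the edges(N) helper defined below
+ * (g and n are illustrative names):
+ *
+ *   for (auto e : g.edges(n)) {
+ *     GraphNode dst = g.getEdgeDst(e);
+ *     EdgeTy& data  = g.getEdgeData(n, e);
+ *   }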
+ * + * @param N node to get the edge of + * @returns iterator to the end of the edges of node N, i.e. the first edge + * of the next node (or an "end" iterator if there is no next node) + */ + inline edge_iterator edge_end(GraphNode N) { return graph->edges(N).end(); } + + /** + * Return the degree of the edge in the local graph + **/ + inline uint64_t localDegree(GraphNode N) { return graph->getDegree(N); } + + /** + * Returns an iterable object over the edges of a particular node in the + * graph. + * + * @param N node to get edges iterator over + */ + inline galois::runtime::iterable> + edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(edge_begin(N), + edge_end(N)); + } + + /** + * Gets number of nodes on this (local) graph. + * + * @returns number of nodes present in this (local) graph + */ + inline size_t size() const { return graph->size(); } + + /** + * Gets number of edges on this (local) graph. + * + * @returns number of edges present in this (local) graph + */ + inline size_t sizeEdges() { return graph->sizeEdges(); } + + /** + * Gets number of nodes on this (local) graph. + * + * @returns number of nodes present in this (local) graph + */ + inline size_t numMasters() const { return numOwned; } + + /** + * Gets number of nodes with edges (may include nodes without edges) + * on this (local) graph. + * + * @returns number of nodes with edges (may include nodes without edges + * as it measures a contiguous range) + */ + inline size_t getNumNodesWithEdges() const { return numNodesWithEdges; } + + /** + * Gets number of nodes on the global unpartitioned graph. + * + * @returns number of nodes present in the global unpartitioned graph + */ + inline size_t globalSize() const { return numGlobalNodes; } + + /** + * Gets number of edges on the global unpartitioned graph. + * + * @returns number of edges present in the global unpartitioned graph + */ + inline size_t globalSizeEdges() const { return numGlobalEdges; } + + /** + * Returns a range object that encapsulates all nodes of the graph. + * + * @returns A range object that contains all the nodes in this graph + */ + inline const NodeRangeType& allNodesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[0]; + } + + /** + * Returns a range object that encapsulates only master nodes in this + * graph. + * + * @returns A range object that contains the master nodes in this graph + */ + inline const NodeRangeType& masterNodesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[1]; + } + + /** + * Returns a range object that encapsulates master nodes and nodes + * with edges in this graph. + * + * @returns A range object that contains the master nodes and the nodes + * with outgoing edges in this graph + */ + inline const NodeRangeType& allNodesWithEdgesRange() const { + assert(specificRanges.size() == 3); + return specificRanges[2]; + } + + /** + * Returns a vector object that contains the global IDs (in order) of + * the master nodes in this graph. + * + * @returns A vector object that contains the global IDs (in order) of + * the master nodes in this graph + */ + std::vector getMasterGlobalIDs() { + std::vector IDs; + + IDs.reserve(numMasters()); + for (auto node : masterNodesRange()) { + IDs.push_back(getGID(node)); + } + + return IDs; + } + +protected: + /** + * Uses a pre-computed prefix sum to determine division of nodes among + * threads. + * + * The call uses binary search to determine the ranges. 
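+ *
+ * Sketch of the result: allNodesRanges holds activeThreads + 1 cut
+ * points, and thread t is meant to work on local IDs in
+ * [allNodesRanges[t], allNodesRanges[t + 1]), with the cuts placed so
+ * each thread sees a roughly equal share of edges.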
+ */ + inline void determineThreadRanges() { + allNodesRanges = galois::graphs::determineUnitRangesFromPrefixSum( + galois::runtime::activeThreads, graph->getEdgePrefixSum()); + } + + /** + * Determines the thread ranges for master nodes only and saves them to + * the object. + * + * Only call after graph is constructed + only call once + */ + inline void determineThreadRangesMaster() { + // make sure this hasn't been called before + assert(masterRanges.size() == 0); + + // first check if we even need to do any work; if already calculated, + // use already calculated vector + if (beginMaster == 0 && (beginMaster + numOwned) == size()) { + masterRanges = allNodesRanges; + } else if (beginMaster == 0 && + (beginMaster + numOwned) == numNodesWithEdges && + withEdgeRanges.size() != 0) { + masterRanges = withEdgeRanges; + } else { + galois::gDebug("Manually det. master thread ranges"); + masterRanges = galois::graphs::determineUnitRangesFromGraph( + *graph, galois::runtime::activeThreads, beginMaster, + beginMaster + numOwned, 0, true); + } + } + + /** + * Determines the thread ranges for nodes with edges only and saves them to + * the object. + * + * Only call after graph is constructed + only call once + */ + inline void determineThreadRangesWithEdges() { + // make sure not called before + assert(withEdgeRanges.size() == 0); + + // first check if we even need to do any work; if already calculated, + // use already calculated vector + if (numNodesWithEdges == size()) { + withEdgeRanges = allNodesRanges; + } else if (beginMaster == 0 && + (beginMaster + numOwned) == numNodesWithEdges && + masterRanges.size() != 0) { + withEdgeRanges = masterRanges; + } else { + galois::gDebug("Manually det. with edges thread ranges"); + withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( + *graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); + } + } + + /** + * Initializes the 3 range objects that a user can access to iterate + * over the graph in different ways. + */ + void initializeSpecificRanges() { + assert(specificRanges.size() == 0); + + // TODO/FIXME assertion likely not safe if a host gets no nodes + // make sure the thread ranges have already been calculated + // for the 3 ranges + assert(allNodesRanges.size() != 0); + assert(masterRanges.size() != 0); + assert(withEdgeRanges.size() != 0); + + // 0 is all nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(0), + boost::counting_iterator(size()), allNodesRanges.data())); + + // 1 is master nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(beginMaster), + boost::counting_iterator(beginMaster + numOwned), + masterRanges.data())); + + // 2 is with edge nodes + specificRanges.push_back(galois::runtime::makeSpecificRange( + boost::counting_iterator(0), + boost::counting_iterator(numNodesWithEdges), + withEdgeRanges.data())); + + assert(specificRanges.size() == 3); + } + + /** + * Specific range editor: makes the range for edges equivalent to the range + * for masters. + */ + void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; } + + void recalculateG2LMap() { + for (uint64_t i = 0; i < localToGlobalVector.size(); i++) { + globalToLocalMap[localToGlobalVector[i]] = i; + } + } + +public: + /** + * Write the local LC_CSR graph to the file on a disk. + * + * @todo revive this + */ + void save_local_graph_to_file(std::string) { GALOIS_DIE("not implemented"); } + + /** + * Read the local LC_CSR graph from the file on a disk. 
+ * + * @todo revive this + */ + void read_local_graph_from_file(std::string) { + GALOIS_DIE("not implemented"); + } + + /** + * Deallocates underlying LC CSR Graph + */ + void deallocate() { + galois::gDebug("Deallocating CSR in DistLocalGraph"); + graph->deallocate(); + } + + /** + * Sort the underlying LC_CSR_Graph by ID (destinations) + * It sorts edges of the nodes by destination. + */ + void sortEdgesByDestination() { + galois::do_all( + galois::iterate(graph->vertices().begin(), graph->vertices().end()), + [&](GraphNode n) { graph->sortEdges(n); }, galois::no_stats(), + galois::loopname("CSREdgeSort"), galois::steal()); + } + + //! Used by substrate to determine if some stats are to be reported + bool is_a_graph() const { return true; } + inline NodeTy& getTopologyID(uint64_t nodeID) { + return graph.getData(getLID(nodeID)); + } + + inline NodeTy& getTopologyIDFromIndex(uint64_t index) { + return graph.getData(index); + } + + uint64_t getTokenID(NodeTy& vertex) { + return getGID(&vertex - &graph.getData(0)); + } + + uint32_t getVertexIndex(NodeTy& vertex) { + return (&vertex - &graph.getData(0)); + } + + uint64_t getLocalityVertex(NodeTy& vertex) { + uint64_t gid = getTopologyID(vertex); + return getHostIDImpl(gid); + } + + /** Edge Manipulation **/ + edge_iterator mintEdgeHandle(NodeTy& src, std::uint64_t off) { + return edge_begin(src) + off; + } + + // template + // typename std::enable_if::value>::type + // setData(typename GraphTy::node_data_reference vertex, T data) { + // graph.setData(vertex, data); + // } + + ///** Data Manipulations **/ + + // typename GraphTy::node_data_reference + // getData(typename GraphTy::node_data_reference vertex) { + // return graph.getData(getTokenID(vertex)); + // } + + template + typename std::enable_if::value>::type + setEdgeData(edge_iterator eh, T data) { + graph.setEdgeData(eh, data); + } + + template + typename std::enable_if::value, EdgeTy&>::type + getEdgeData(edge_iterator eh) { + return graph.getEdgeData(eh); + } + + enum Task { + ADD_VERTEX, + ADD_VERTEX_TOPOLOGY_ONLY, + ADD_EDGES, + ADD_EDGES_TOPOLOGY_ONLY, + DELETE_VERTEX, + DELETE_EDGES + }; + + template + void sendModifyRequest(uint32_t host, Args... args) { + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, args...); + galois::runtime::getSystemNetworkInterface().sendTagged( + host, galois::runtime::evilPhase, std::move(b)); + } + + // Assumptions: + // 1. A vertex is added before any edges are added to it + // 2. No support for deleting edges/vertices yet + // 3. 
Only works for OEC + void + updateVariables(bool isVertex, uint64_t src, + std::optional> dsts = std::nullopt) { + + if (isVertex) { + if (globalToLocalMap.find(src) == globalToLocalMap.end()) { + localToGlobalVector.push_back(src); + globalToLocalMap[src] = localToGlobalVector.size() - 1; + numNodes++; + } + numOwned++; + } else { + uint64_t srcLID = globalToLocalMap[src]; + if (edge_begin(srcLID) == edge_end(srcLID)) { + numNodesWithEdges++; + } + for (auto token : dsts.value()) { + if (globalToLocalMap.find(token) == globalToLocalMap.end()) { + localToGlobalVector.push_back(token); + globalToLocalMap[token] = localToGlobalVector.size() - 1; + numNodes++; + } + if (!isOwned(token)) { + mirrorNodes[getHostID(token)].push_back(token); + } + } + numEdges += dsts.value().size(); + } + } + + /** Topology Modifications **/ + void addVertexTopologyOnly(uint32_t token) { + uint64_t belongsTo = getHostID(token); + if (belongsTo == id) { + updateVariables(true, token); + // graph->addVertexTopologyOnly(); + } else { + sendModifyRequest(belongsTo, ADD_VERTEX_TOPOLOGY_ONLY, token); + } + } + + template + void addVertex(uint64_t token, T data) { + uint64_t belongsTo = getHostID(token); + if (belongsTo == id) { + updateVariables(true, token); + // graph->setData(getLID(token), data); + } else { + sendModifyRequest(belongsTo, ADD_VERTEX, token, data); + } + } + + void addEdgesTopologyOnly(uint64_t src, std::vector dsts) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + updateVariables(false, src, dsts); + graph->addEdgesTopologyOnly(getLID(src), dsts); + } else { + sendModifyRequest(belongsTo, ADD_EDGES_TOPOLOGY_ONLY, src, dsts); + } + } + + void addEdges(uint64_t src, std::vector dsts, + std::vector data) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + updateVariables(false, src, dsts); + std::vector lids; + for (uint32_t i = 0; i < dsts.size(); i++) { + lids.push_back(getLID(dsts[i])); + } + graph->addEdges(getLID(src), lids, data); + } else { + sendModifyRequest(belongsTo, src, dsts, data); + } + } + + void deleteVertex(uint64_t src) { + uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + // TODO(Divija): Uncomment when we have the graph API + // graph.deleteVertex(getLID(src)); + } else { + sendModifyRequest(belongsTo, DELETE_VERTEX, src); + } + } + + void deleteEdges(uint64_t src, std::vector edges) { + // TODO:Remove dst tokens from local map? 
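+    // Same owner-check pattern as the mutators above (sketch): if this
+    // host owns src the edges would be removed locally once the graph
+    // API exists; otherwise a DELETE_EDGES request is forwarded to the
+    // owning host via sendModifyRequest.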
+ uint64_t belongsTo = getHostID(src); + if (belongsTo == id) { + // TODO(Divija): Uncomment when we have the graph API + // return graph.deleteEdges(getLID(src), edges); + } else { + sendModifyRequest(belongsTo, DELETE_EDGES, src, edges); + } + } +}; + +template +constexpr const char* const + galois::graphs::DistLocalGraph::GRNAME; +} // end namespace graphs +} // end namespace galois + +#endif //_GALOIS_DISTRIBUTED_LOCAL_GRAPH_H diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index 942a2ceb61..3794d9eef1 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -25,8 +25,6 @@ class NoCommunication : public galois::graphs::ReadMasterAssignment { } }; -/** - */ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { public: MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t, @@ -38,6 +36,17 @@ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; } }; +class OECPolicy : public galois::graphs::ReadMasterAssignment { +public: + OECPolicy(uint32_t, uint32_t numHosts, uint64_t, uint64_t, + std::vector&) + : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} + + static bool needNodeDegrees() { return false; } + + bool keepEdge(uint32_t, uint32_t) const { return true; } +}; + class MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment { std::vector& ndegrees; @@ -905,4 +914,145 @@ class SugarColumnFlipP : public galois::graphs::CustomMasterAssignment { } }; +class GnnOEC : public galois::graphs::CustomMasterAssignment { +public: + GnnOEC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges){}; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + //! outgoing edge cut + uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { + return retrieveMaster(src); + } + + bool noCommunication() { return false; } + bool isVertexCut() const { return false; } + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(0u, 0u); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + +class GnnCVC : public galois::graphs::CustomMasterAssignment { + unsigned numRowHosts; + unsigned numColumnHosts; + unsigned _h_offset; + + void factorizeHosts() { + numColumnHosts = sqrt(_numHosts); + + while ((_numHosts % numColumnHosts) != 0) + numColumnHosts--; + + numRowHosts = _numHosts / numColumnHosts; + assert(numRowHosts >= numColumnHosts); + + if (_hostID == 0) { + galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, + "\n"); + } + } + + //! 
Returns the grid row ID of this host + unsigned gridRowID() const { return (_hostID / numColumnHosts); } + //! Returns the grid row ID of the specified host + unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } + //! Returns the grid column ID of this host + unsigned gridColumnID() const { return (_hostID % numColumnHosts); } + //! Returns the grid column ID of the specified host + unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + + //! Find the row of a particular node + unsigned getRowOfNode(uint64_t gid) const { + return gridRowID(retrieveMaster(gid)); + } + + //! Find the column of a particular node + unsigned getColumnOfNode(uint64_t gid) const { + return gridColumnID(retrieveMaster(gid)); + } + +public: + GnnCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges) { + factorizeHosts(); + _h_offset = gridRowID() * numColumnHosts; + }; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { + unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; + unsigned cyclicColumnOffset = getColumnOfNode(dst); + return blockedRowOffset + cyclicColumnOffset; + } + + bool noCommunication() { return false; } + bool isVertexCut() const { + if ((numRowHosts == 1) || (numColumnHosts == 1)) + return false; + return true; + } + + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(numRowHosts, numColumnHosts); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/libcusp/include/galois/graphs/MiningPartitioner.h b/libcusp/include/galois/graphs/MiningPartitioner.h index e49d16023e..c809c24dd0 100644 --- a/libcusp/include/galois/graphs/MiningPartitioner.h +++ b/libcusp/include/galois/graphs/MiningPartitioner.h @@ -540,15 +540,15 @@ class MiningGraph : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::SendBuffer bitsetBuffer; galois::runtime::gSerialize(bitsetBuffer, presentProxies); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // receive loop for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize proxiesOnOtherHosts @@ -653,8 +653,7 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, 
galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); @@ -675,9 +674,9 @@ class MiningGraph : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1057,15 +1056,15 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -1085,8 +1084,8 @@ class MiningGraph : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -1108,7 +1107,7 @@ class MiningGraph : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -1134,8 +1133,8 @@ class MiningGraph : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index bfc92d989a..e8d7e15d8e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -29,6 +29,9 @@ #include "galois/graphs/DistributedGraph.h" #include "galois/DReducible.h" + +#include "shad/ShadGraphConverter.h" + #include #include @@ -75,6 +78,69 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; + std::vector getGNNBreakpoints(std::string filename) { + // contains 2 numbers: begin and end of train + // everything else can be split evenly among hosts as they are not + // performance critical + std::vector bps; + + // TODO(loc) avoid this entirely and load it from file... 
+ // if through all possible GNN outputs + if (filename.find("cora") != std::string::npos) { + bps.push_back(0); + bps.push_back(140); + } else if (filename.find("reddit") != std::string::npos) { + bps.push_back(0); + bps.push_back(153431); + } else if (filename.find("citeseer") != std::string::npos) { + bps.push_back(0); + bps.push_back(120); + } else if (filename.find("pubmed") != std::string::npos) { + bps.push_back(0); + bps.push_back(60); + } else if (filename.find("ppi") != std::string::npos) { + bps.push_back(0); + bps.push_back(9716); + } else if (filename.find("tester") != std::string::npos) { + bps.push_back(0); + bps.push_back(5); + } else if (filename.find("ogbn-arxiv") != std::string::npos) { + bps.push_back(0); + bps.push_back(169252); + } else if (filename.find("ogbn-products") != std::string::npos) { + bps.push_back(0); + bps.push_back(196615); + } else if (filename.find("yelp") != std::string::npos) { + // this is entire graph: yelp's mask isn't contiguous + bps.push_back(0); + bps.push_back(716847); + } else if (filename.find("amazon") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(1569960); + } else if (filename.find("ogbn-proteins") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(86618); + } else if (filename.find("ogbn-papers100M-remap") != std::string::npos) { + galois::gInfo("papers remap being used"); + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(1207178); + } else if (filename.find("ogbn-papers100M") != std::string::npos) { + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(111059956); + } else { + // TODO(loc) only die under certain conditions; don't die if something + // is missing + // GALOIS_DIE("invalid input for gnn partitioning ", filename, + // " hardcode needed"); + } + + return bps; + } + public: //! 
typedef for base DistGraph class using base_DistGraph = DistGraph; @@ -157,7 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false, + bool useWMD = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -170,24 +237,83 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.start(); if (readFromFile) { - galois::gPrint("[", base_DistGraph::id, - "] Reading local graph from file ", localGraphFileName, - "\n"); + galois::gDebug("[", base_DistGraph::id, + "] Reading local graph from file ", localGraphFileName); base_DistGraph::read_local_graph_from_file(localGraphFileName); Tgraph_construct.stop(); return; } - galois::graphs::OfflineGraph g(filename); - base_DistGraph::numGlobalNodes = g.size(); - base_DistGraph::numGlobalEdges = g.sizeEdges(); + galois::graphs::OfflineGraph* offlineGraph{nullptr}; + + std::string host_prefix = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + + shad::ShadGraphConverter shadConverter; + galois::graphs::BufferedGraph bufGraph; + bufGraph.resetReadCounters(); + std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight); + if (useWMD) { + uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + galois::gInfo(host_prefix, "Starts reading SHAD graph file"); + // Read and load the whole SHAD WMD dataset to memory. + // TODO(hc): Note that this reads the entire graph. + // We will improve this to read partial graphs + // on each host later. For now, the main focus is + // to enable WMD dataset for the workflows. + shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + galois::gInfo(host_prefix, "Completes reading SHAD graph file"); + base_DistGraph::numGlobalNodes = numGlobalNodes; + base_DistGraph::numGlobalEdges = numGlobalEdges; + + galois::gInfo(host_prefix, + "Read graph # nodes:", std::to_string(numGlobalNodes), + " # edges:", std::to_string(numGlobalEdges)); + galois::gInfo(host_prefix, "Starts node array construction from SHAD" + " graph"); + // Construct node data/outgoing index range arrays + // for a GLOBAL array, not a local array. + // Later, parts for the local graph partition will be + // extracted and be used after graph partitioning is done. + // Basically, the idea that is used here is to mimic + // the BufferedGraph. BufferedGraph does not load the whole arrays + // to memory, but only reads and loads parts of the arrays from + // an input file. It is possible since the .gr files are stored + // in a CSR format, and in a consecutive manner. We can know + // offset for each data in advance. + // However, we cannot achieve it from a SHAD graph file since + // it is not consecutive, but edges and nodes are mixed. + // Due to this, we construct nodes' array for a global graph + // here. This array will be restructured after CuSP decides + // local nodes. + // TODO(hc): UT will improve and redesign this part to + // get scalability. 
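+        // (Sketch of the flow: the arrays built here span the full
+        // [0, numGlobalNodes) range; the slice belonging to this host is
+        // extracted later in constructCSRFromSHADGraph, once the
+        // partitioner has fixed this host's node range.)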
+ shadConverter.constructNodeArrays(0, numGlobalNodes, numGlobalNodes); + galois::gInfo(host_prefix, "Completes node array construction from SHAD" + " graph"); + // Compute master proxies by using the number of global nodes + // and edges. + base_DistGraph::computeMasters( + md, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, edgeWeight); + } else { + offlineGraph = new galois::graphs::OfflineGraph(filename); + base_DistGraph::numGlobalNodes = offlineGraph->size(); + base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, + edgeWeight); + } } else { + if (useWMD) { + GALOIS_DIE("SHAD graph format does not support master block file"); + } galois::gInfo("Getting reader assignment from file"); - base_DistGraph::readersFromFile(g, masterBlockFile); + base_DistGraph::readersFromFile(*offlineGraph, masterBlockFile); } graphPartitioner = std::make_unique( @@ -196,12 +322,43 @@ class NewDistGraphGeneric : public DistGraph { // TODO abstract this away somehow graphPartitioner->saveGIDToHost(base_DistGraph::gid2host); - uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = - g.edge_begin(nodeBegin); - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = - g.edge_begin(nodeEnd); + // get training nodes and split evenly among hosts + std::vector trainPoints = this->getGNNBreakpoints(filename); + // TODO(hc) + if (!trainPoints.empty()) { + std::vector testDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], + trainPoints[1]); + + std::vector restDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], + offlineGraph->size()); + + // create global distribution of edges + std::vector mappings(offlineGraph->size()); + galois::do_all( + galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), + [&](size_t h) { + // test + uint32_t hCur = testDistribution[h]; + uint32_t hEnd = testDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + // the rest + hCur = restDistribution[h]; + hEnd = restDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + }); + bool validPart = graphPartitioner->predeterminedMapping(mappings); + if (!validPart) { + galois::gWarn("partitioning policy used doesn't use trainpoints"); + } + } // signifies how many outgoing edges a particular host should expect from // this host @@ -222,27 +379,39 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 - galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n"); - galois::graphs::BufferedGraph bufGraph; - bufGraph.resetReadCounters(); + galois::gDebug("[", base_DistGraph::id, "] Starting graph reading."); galois::StatTimer graphReadTimer("GraphReading", GRNAME); graphReadTimer.start(); - bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, - *edgeEnd, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges); + + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + + if (!useWMD) { + // If the input graph is not SHAD WMD format, + // construct a buffered graph from the file directly, as ordinary. 
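+      // (Sketch: [nodeBegin, nodeEnd) is this host's reader range, and
+      // *edgeBegin / *edgeEnd below are the matching offsets into the
+      // on-disk CSR, so loadPartialGraph only reads that slice of the
+      // file.)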
+ typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = + offlineGraph->edge_begin(nodeBegin); + typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = + offlineGraph->edge_begin(nodeEnd); + bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, + *edgeEnd, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges); + } else { + constructCSRFromSHADGraph(&bufGraph, &shadConverter, nodeBegin, nodeEnd, + host_prefix); + } + graphReadTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Reading graph complete."); if (graphPartitioner->masterAssignPhase()) { // loop over all nodes, determine where neighbors are, assign masters galois::StatTimer phase0Timer("Phase0", GRNAME); - galois::gPrint("[", base_DistGraph::id, - "] Starting master assignment.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting master assignment."); phase0Timer.start(); phase0(bufGraph, cuspAsync, stateRounds); phase0Timer.stop(); - galois::gPrint("[", base_DistGraph::id, - "] Master assignment complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Master assignment complete."); } galois::StatTimer inspectionTimer("EdgeInspection", GRNAME); @@ -357,13 +526,23 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::initializeSpecificRanges(); Tgraph_construct.stop(); - galois::gPrint("[", base_DistGraph::id, "] Graph construction complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); + + if (useWMD) { + // Different from the gr format file that has been used by Galois + // and does not contain node data in the file, + // a SHAD graph file has a single type for each node, and it + // is considered as node data. + // This function constructs and sets node data (type). + assignNodeDataFromSHADProp(&shadConverter); + } // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", (uint32_t)stateRounds); } + galois::gPrint("[", base_DistGraph::id, "] Dist graph constructed\n"); } private: @@ -406,6 +585,108 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /// Construct arrays for in-memory CSR. + /// In case of the node out-going edge range array and + /// the node data array, it will extract parts corresponding to + /// local graph paritition from the arrays holding the global + /// array information. + /// Edge destination and data arrays are constructed based on + /// unordered maps constructed from SHAD graph reading. + /// NOTE that those arrays for CSR all store GLOBAL node ids. + /// For example, edge destination array's size is equal + /// to the number of local edges, but its destination ID is + /// global node IDs, not local node IDs. 
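+  ///
+  /// Example (hypothetical ranges): if this host reads global nodes
+  /// [100, 200), the out-index slice extracted here covers just those
+  /// 100 nodes, while the edge-destination array keeps global IDs
+  /// (e.g. 57 or 1024) rather than local ones.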
+ /// + /// @tparam T Graph node data type + /// + /// @param bufGraph Buffered graph to construct + /// @param shadConverter Shad graph ingestor which ingested + /// a SHAD graph in memory to an unordered node/edge map + /// @param nodeBegin Global id of the first local node range + /// @param nodeEnd Global id of the last local node range + /// @param host_prefix Log prefix string for this host + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void + constructCSRFromSHADGraph(galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, + std::string host_prefix) { + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. + uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = (nodeBegin == 0) ? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + galois::gInfo(host_prefix, "Starts local out index array construction"); + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); + galois::gInfo(host_prefix, "Completes local out index array construction"); + + galois::gInfo(host_prefix, "Starts edge destination/data " + "array construction"); + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, + numLocalEdges); + + galois::gInfo(host_prefix, "Completes edge destination/data " + "array construction"); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter->constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, bufGraph); + galois::gInfo(host_prefix, "Completes buffered graph construction from" + " SHAD graph"); + } + + // Disable this method for non-SHAD graph construction. + template >* = + nullptr> + void constructCSRFromSHADGraph(galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, + std::string) {} + + /** + * @brief Assign a SHAD node type to a node data. + * + * @detail Different from the gr format file that has been used by Galois + * and does not contain node data in the file, + * a SHAD graph file has a single type for each node, and it + * considered as node data. This function constructs and sets node + * data based on that. + * This function assumes that the node type's data type is always + * uint64_t. + * + * @tparam T Node data type + * + * @param shadConverter SHAD graph converter holding node data from a + * SHAD file. 
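+   *
+   * In effect (see the loop below), each local node lid receives
+   * nodeDataBuffer[getGID(lid)] as its node data.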
+ */ + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); + shad::ShadNodeTy* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + }); + } + + template >* = + nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter*) {} + /** * For each other host, determine which nodes that this host needs to get * info from @@ -496,16 +777,10 @@ class NewDistGraphGeneric : public DistGraph { lid++; } } - galois::gDebug("[", base_DistGraph::id, " -> ", h, "] bitset size ", - (end - start) / 64, " vs. vector size ", - syncNodes[h].size() / 2); } lid -= numLocal; assert(lid == numToReserve); - galois::gDebug("[", base_DistGraph::id, "] total bitset size ", - (ghosts.size() - numLocal) / 64, " vs. total vector size ", - numToReserve / 2); // TODO: should not be used after this - refactor to make this clean ghosts.resize(0); @@ -539,16 +814,16 @@ class NewDistGraphGeneric : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]); bytesSent += bitsetBuffer.size(); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // Step 5: recv bitset to other hosts; this indicates which local nodes each // other host needs to be informed of updates of for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize into neighbor bitsets @@ -638,7 +913,7 @@ class NewDistGraphGeneric : public DistGraph { // note the +1 on evil phase; load messages send using a different // phase to avoid conflicts - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } } sendTimer.stop(); @@ -658,13 +933,13 @@ class NewDistGraphGeneric : public DistGraph { std::vector& edgeLoads, galois::DynamicBitSet& loadsClear) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p; + decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1())) p; galois::StatTimer recvTimer("Phase0AsyncRecvLoadTime", GRNAME); recvTimer.start(); do { // note the +1 - p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr); + p = net.recieveTagged(base_DistGraph::evilPhasePlus1()); if (p) { unsigned messageType = (unsigned)-1; @@ -859,13 +1134,13 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, mastersToSend); } bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } else { // send empty no-op message, tag 0 galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, 0u); bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } sendOffsetsTimer.stop(); @@ -934,9 +1209,9 @@ class NewDistGraphGeneric : 
public DistGraph { bytesSent += b.size(); // assumes phase is 0 or 1 if (phase == 1) { - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } else if (phase == 0) { - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } else { GALOIS_DIE("unexpected phase: ", phase); } @@ -954,8 +1229,6 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedOffsets, std::vector& receivedMasters) { uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first; - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, " offset ", - hostOffset); // if execution gets here, messageType was 1 or 2 assert(receivedMasters.size() == receivedOffsets.size()); @@ -963,10 +1236,8 @@ class NewDistGraphGeneric : public DistGraph { galois::do_all( galois::iterate((size_t)0, receivedMasters.size()), [&](size_t i) { - uint64_t curGID = hostOffset + receivedOffsets[i]; - uint32_t indexIntoMap = gid2offsets[curGID]; - galois::gDebug("[", base_DistGraph::id, "] gid ", curGID, " offset ", - indexIntoMap); + uint64_t curGID = hostOffset + receivedOffsets[i]; + uint32_t indexIntoMap = gid2offsets[curGID]; localNodeToMaster[indexIntoMap] = receivedMasters[i]; }, galois::no_stats()); @@ -985,9 +1256,9 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedMasters) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1011,9 +1282,6 @@ class NewDistGraphGeneric : public DistGraph { messageType); } - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); - return std::make_pair(sendingHost, messageType); } @@ -1030,11 +1298,11 @@ class NewDistGraphGeneric : public DistGraph { std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; // repeat loop until no message do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); if (p) { uint32_t sendingHost = p->first; unsigned messageType = (unsigned)-1; @@ -1068,9 +1336,6 @@ class NewDistGraphGeneric : public DistGraph { GALOIS_DIE("invalid message type for sync of master assignments: ", messageType); } - - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); } } while (p); } @@ -1258,8 +1523,6 @@ class NewDistGraphGeneric : public DistGraph { // gid to vector offset setup std::unordered_map gid2offsets; uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes); - galois::gDebug("[", base_DistGraph::id, "] num neighbors found is ", - neighborCount); // send off neighbor metadata phase0SendRecv(syncNodes); @@ -1291,7 +1554,7 @@ class NewDistGraphGeneric : public DistGraph { if (async) { if (base_DistGraph::id == 0) { - galois::gPrint("Using asynchronous master determination sends.\n"); + galois::gDebug("Using asynchronous master determination sends."); } hostFinished.resize(base_DistGraph::numHosts); @@ -1309,8 +1572,8 @@ class 
NewDistGraphGeneric : public DistGraph { #endif if (base_DistGraph::id == 0) { - galois::gPrint("Number of BSP sync rounds in master assignment: ", - stateRounds, "\n"); + galois::gDebug("Number of BSP sync rounds in master assignment: ", + stateRounds); } // galois::PerThreadTimer ptt( @@ -1328,13 +1591,6 @@ class NewDistGraphGeneric : public DistGraph { auto work = getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode); - // debug print - // galois::on_each([&] (unsigned i, unsigned j) { - // galois::gDebug("[", base_DistGraph::id, " ", i, "] sync round ", - // syncRound, " local range ", - // *work.local_begin(), " ", *work.local_end()); - //}); - galois::do_all( // iterate over my read nodes galois::iterate(work), @@ -1352,10 +1608,6 @@ class NewDistGraphGeneric : public DistGraph { // on map with subtraction localNodeToMaster[node - globalOffset] = assignedHost; - // galois::gDebug("[", base_DistGraph::id, "] state round ", - // syncRound, - // " set ", node, " ", node - globalOffset); - // ptt.stop(); }, galois::loopname("Phase0DetermineMasters"), galois::steal(), @@ -1386,13 +1638,6 @@ class NewDistGraphGeneric : public DistGraph { asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear); } loadSyncTimer.stop(); - -#ifndef NDEBUG - if (async) { - galois::gDebug("[", base_DistGraph::id, "] host count ", - hostFinished.count()); - } -#endif } // if asynchronous, don't move on until everything is done @@ -1409,14 +1654,6 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { - //#ifndef NDEBUG - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to - // finish, ", - // hostFinished.count()); - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts - // loads " - // "syncs to finish, ", loadsClear.count()); - //#endif // make sure all assignments are done and all loads are done syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); @@ -1425,15 +1662,9 @@ class NewDistGraphGeneric : public DistGraph { waitTime.stop(); } -#ifndef NDEBUG - printLoad(nodeLoads, nodeAccum); - printLoad(edgeLoads, edgeAccum); -#endif - // sanity check for correctness (all should be assigned) for (uint32_t i = 0; i < localNodeToMaster.size(); i++) { if (localNodeToMaster[i] == (uint32_t)-1) { - // galois::gDebug("[", base_DistGraph::id, "] bad index ", i); assert(localNodeToMaster[i] != (uint32_t)-1); } } @@ -1444,9 +1675,9 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::increment_evilPhase(); } - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Local master assignment " - "complete.\n"); + "complete."); // one more step: let masters know of nodes they own (if they don't // have the node locally then this is the only way they will learn about @@ -1458,7 +1689,7 @@ class NewDistGraphGeneric : public DistGraph { recvMastersToOwners(); p0master2ownerTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Received my master mappings.\n"); + galois::gDebug("[", base_DistGraph::id, "] Received my master mappings."); base_DistGraph::increment_evilPhase(); @@ -1503,11 +1734,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - 
allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // get incoming mirrors ready for creation uint32_t additionalMirrorCount = incomingMirrors.count(); @@ -1606,7 +1836,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1637,10 +1867,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1658,7 +1888,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1688,10 +1918,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1724,11 +1954,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); // report edge inspection time uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // old inspection barrier // galois::runtime::getHostBarrier().wait(); @@ -1990,9 +2219,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; // size_t actuallySet = 0; for (uint32_t offset : offsetsToConsider.getOffsets()) { - // galois::gDebug("[", base_DistGraph::id, "] ", " setting ", - // offset + hostOffset, " from host ", senderHost, - // " to ", recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(offset + hostOffset, recvMasterLocations[curCount]); // bool set = graphPartitioner->addMasterMapping(offset + hostOffset, @@ -2000,9 +2226,6 @@ class NewDistGraphGeneric : public DistGraph { // if (set) { actuallySet++; } curCount++; } - - // galois::gDebug("[", base_DistGraph::id, "] host ", senderHost, ": set ", - // actuallySet, " out of ", recvMasterLocations.size()); } /** @@ -2019,9 +2242,6 @@ class NewDistGraphGeneric : public 
DistGraph { size_t curCount = 0; for (uint64_t gid : gids) { assert(gid < base_DistGraph::numGlobalNodes); - // galois::gDebug("[", base_DistGraph::id, "] ", " in-setting ", gid, " to - // ", - // recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]); curCount++; } @@ -2082,7 +2302,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, offsets); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2091,7 +2310,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, 1); galois::runtime::gSerialize(b, curBitset); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2102,14 +2320,13 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); - galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection sends complete."); } /** @@ -2127,9 +2344,9 @@ class NewDistGraphGeneric : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -2189,8 +2406,7 @@ class NewDistGraphGeneric : public DistGraph { } } - galois::gPrint("[", base_DistGraph::id, - "] Inspection receives complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection receives complete."); } /** @@ -2217,10 +2433,7 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gDebug("[", base_DistGraph::id, - "] To receive this many nodes: ", nodesToReceive); - - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection mapping complete."); return prefixSumOfEdges; } @@ -2256,9 +2469,6 @@ class NewDistGraphGeneric : public DistGraph { galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { - // galois::gDebug("[", base_DistGraph::id, "] ", i + startNode, - // " mapped to ", - // graphPartitioner->retrieveMaster(i+startNode)); if (graphPartitioner->retrieveMaster(i + startNode) == myHID) { count++; } @@ -2275,9 +2485,7 @@ class NewDistGraphGeneric : public DistGraph { assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); - uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; - galois::gDebug("[", base_DistGraph::id, "] This many masters from host ", - h, ": ", newMasterNodes); + uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; uint32_t startingNodeIndex = base_DistGraph::numNodes; // increase size of prefix sum + mapping vector 
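+    // (illustrative) threadPrefixSums is prefix-summed over the per-thread
+    // master counts gathered above, e.g. per-thread counts {3, 1, 2} become
+    // {3, 4, 6}, so its last entry is the total number of new masters learned
+    // from host h; the prefix-sum and mapping vectors below grow by exactly
+    // that many entries.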
prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes); @@ -2565,6 +2773,7 @@ class NewDistGraphGeneric : public DistGraph { for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes; i++) { uint32_t globalID = base_DistGraph::localToGlobalVector[i]; + assert(graphPartitioner->retrieveMaster(globalID) != base_DistGraph::id); base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)] .push_back(globalID); } @@ -2577,9 +2786,9 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::BufferedGraph& bufGraph) { if (base_DistGraph::id == 0) { if (std::is_void::value) { - fprintf(stderr, "Loading void edge-data while creating edges.\n"); + galois::gDebug("Loading void edge-data while creating edges."); } else { - fprintf(stderr, "Loading edge-data while creating edges.\n"); + galois::gDebug(stderr, "Loading edge-data while creating edges."); } } @@ -2604,10 +2813,10 @@ class NewDistGraphGeneric : public DistGraph { loadEdgeTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Edge loading time: ", + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", loadEdgeTimer.get_usec() / 1000000.0f, " seconds to read ", bufBytesRead, " bytes (", - bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)\n"); + bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)"); } // Edge type is not void. (i.e. edge data exists) @@ -2719,16 +2928,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2751,8 +2959,8 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -2865,16 +3073,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2897,8 +3104,7 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); } } } @@ -2920,7 +3126,7 @@ class NewDistGraphGeneric : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while 
(rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -2946,8 +3152,8 @@ class NewDistGraphGeneric : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/test/CMakeLists.txt b/libcusp/test/CMakeLists.txt new file mode 100644 index 0000000000..710627302c --- /dev/null +++ b/libcusp/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(shad_dist_graph shad-dist-graph.cpp) +target_link_libraries(shad_dist_graph galois_gnn) diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp new file mode 100644 index 0000000000..492bfeb2ad --- /dev/null +++ b/libcusp/test/shad-dist-graph.cpp @@ -0,0 +1,132 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +#include + +#include "galois/Galois.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "shad/ShadGraphConverter.h" + +int main() { + galois::DistMemSys G; + unsigned M = galois::substrate::getThreadPool().getMaxThreads(); + // M = 1; + galois::setActiveThreads(M); + + shad::ShadGraphConverter shadConverter; + size_t numNodes{0}, numEdges{0}; + + // TODO(hc): This path should be properly set based on user's environment. + // Later, this test dataset will be included in the Galois repository, and + // will use a relative path. 
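+  // One interim option (illustrative sketch only, not wired up here): take
+  // the path from an environment variable and fall back to the default, e.g.
+  //   const char* p = std::getenv("SHAD_TEST_CSV"); // hypothetical variable
+  //   std::string filename = p ? p : "/home/hochan/data.01.csv";
+  // (std::getenv requires <cstdlib>.)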
+ std::string filename = "/home/hochan/data.01.csv"; + shadConverter.readSHADFile(filename, &numNodes, &numEdges); + std::unique_ptr> + graph = galois::cuspPartitionGraph( + filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + + std::cout << "Test starts...\n"; + + galois::DGAccumulator sumGlobalNodes; + galois::DGAccumulator sumGlobalEdges; + + sumGlobalNodes.reset(); + sumGlobalEdges.reset(); + + sumGlobalNodes += graph->numMasters(); + sumGlobalEdges += graph->sizeEdges(); + + uint64_t reducedSumGlobalNodes = sumGlobalNodes.reduce(); + uint64_t reducedSumGlobalEdges = sumGlobalEdges.reduce(); + + assert(reducedSumGlobalNodes == numNodes); + assert(reducedSumGlobalNodes == graph->globalSize()); + assert(reducedSumGlobalEdges == numEdges); + assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + + std::cout << "Num. nodes/edges tests has been passed\n"; + + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; + { + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src).type + << ", key: " << graph->getData(src).key << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << graph->getEdgeData(e) + << "\n"; + } + } + fp.close(); + } + + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + + ".graph"); + for (uint32_t i = 0; i < graph->size(); ++i) { + fp << i << ", " << graph->getGID(i) << ", " << graph->getData(i).type + << ", " << graph->getData(i).key << "\n"; + } + fp.close(); + } + } +#if 0 + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".mirror"); + for (uint32_t i = 0; + i < graph->getMirrorNodes()[host].size(); ++i) { + uint64_t srcglobal = graph->getMirrorNodes()[host][i]; + uint32_t src = graph->getLID(srcglobal); + fp << "src:" << src << ", global:" << srcglobal << ", node data:" << + graph->getData(src) << "\n" << std::flush; + + assert(shadConverter.checkNode(srcglobal, graph->getData(src))); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + //if (std::distance(graph->edge_begin(src), graph->edge_end(src)) > 0) { + for (auto e : graph->edges(src)) { + uint32_t dst = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dst); + assert(shadConverter.checkNode(dstglobal, graph->getData(dst))); + assert(shadConverter.checkEdge(srcglobal, dstglobal, + std::distance(graph->edge_begin(src), e), + graph->getEdgeData(e))); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n" << std::flush; + } + } + fp.close(); + } + } +#endif + + return 0; +} diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt new file mode 100644 index 0000000000..44be89edad --- /dev/null +++ b/libdeepgalois/CMakeLists.txt @@ -0,0 +1,131 @@ +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") +if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + 
message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) +message(STATUS "BLAS_INC_DIR: ${BLAS_INC_DIR}") +message(STATUS "BLAS_LIB_DIR: ${BLAS_LIB_DIR}") + +# galois base libs +include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +link_directories(${CMAKE_SOURCE_DIR}/libgalois) + +if(GALOIS_ENABLE_GPU) + # hetero path + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU --extended-lambda ${CUDA_NVCC_FLAGS}") + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + include_directories("${CUB_ROOT}") + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + include_directories("${MGPU_ROOT}/src") + include_directories(${CUDA_HOME}/include) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + + find_package(CUDA REQUIRED) + set(CUDA_SEPARABLE_COMPILATION ON) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(CUDA_HOST_COMPILER g++) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_60,code=sm_60) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) + link_directories(${CUDA_HOME}/lib64) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) + message(STATUS "CUDA_LIB_DIR: ${CUDA_HOME}/lib64") + + set(CUDA_SOURCES + src/layers/graph_conv_layer.cu + src/layers/softmax_loss_layer.cu + src/layers/sigmoid_loss_layer.cu + src/layers/leaky_relu_layer.cu + src/layers/l2_norm_layer.cu + src/layers/relu_layer.cu + src/layers/aggregator.cu + src/math_functions.cu + src/optimizer.cu + src/DistContext.cu + src/Sampler.cu + src/lgraph.cu + src/node.cu + src/Net.cu + ) + cuda_add_library(dg_gpu ${CUDA_SOURCES}) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) + set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") + set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp src/Train.cpp) +else() + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/optimizer.cpp + src/DistContext.cpp + src/RandomWalk.cpp + src/Sampler.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/Train.cpp + src/node.cpp + src/Net.cpp + ) +endif(GALOIS_ENABLE_GPU) + +add_library(dg_cpu STATIC ${sources}) +target_link_libraries(dg_cpu galois_shmem) +target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) +target_link_libraries(dg_cpu ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libgalois/include + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +# dist galois setup/linking to dg_cpu +if(GALOIS_ENABLE_DIST) + 
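+  # (illustrative) this branch is taken when the tree is configured with
+  # something like `cmake -DGALOIS_ENABLE_DIST=ON ..`; dg_cpu then also links
+  # the distributed runtime, the CuSP partitioner, and the Gluon substrate.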
target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + + if(GALOIS_ENABLE_GPU) + target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_gpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + endif() +endif() + +set_target_properties(dg_cpu PROPERTIES + INTERFACE_POSITION_INDEPENDENT_CODE On + POSITION_INDEPENDENT_CODE On +) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h new file mode 100644 index 0000000000..ba3d1510bf --- /dev/null +++ b/libdeepgalois/include/deepgalois/Context.h @@ -0,0 +1,48 @@ +#pragma once +#include +#include +#include "deepgalois/types.h" +#include "deepgalois/reader.h" +#include "deepgalois/configs.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class Context { + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + std::string dataset; + Reader reader; + +public: + GraphCPU* graph_cpu; // the input graph, |V| = N + GraphCPU* getGraphPointer() { return graph_cpu; } + Context() : Context(false) {} + //! initializer for gpu; goes ahead and sets a few things + Context(bool use_gpu) : is_device(use_gpu), is_selfloop_added(false) {} + ~Context() {} + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } + size_t read_graph(bool selfloop) { + graph_cpu = new GraphCPU(); + graph_cpu->readGraph(dataset, selfloop); + is_selfloop_added = selfloop; + return graph_cpu->size(); + } + + //! Checks if subgraph being used, sets currenet graph, then calls degreex + //! counting + GraphCPU* getFullGraph() { + graph_cpu + ->degree_counting(); // TODO: why is it here? should be in read_graph + return graph_cpu; + } +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h new file mode 100644 index 0000000000..3ecf9ed411 --- /dev/null +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -0,0 +1,142 @@ +#ifndef __DG_DIST_CONTEXT__ +#define __DG_DIST_CONTEXT__ +#ifdef GALOIS_ENABLE_GPU +#include "deepgalois/cutils.h" +#else +#include "galois/graphs/GluonSubstrate.h" +#endif + +#include "deepgalois/types.h" +#include "deepgalois/Context.h" +#include "deepgalois/GraphTypes.h" +#include "deepgalois/reader.h" + +namespace deepgalois { + +class DistContext { + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + bool usingSingleClass; + std::string dataset; + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // learning graph version + DGraph* partitionedGraph; // the input graph, |V| = N + std::vector partitionedSubgraphs; + label_t* h_labels; // labels for classification. 
Single-class: Nx1, + // multi-class: NxE + float_t* h_feats; // input features: N x D +#ifdef GALOIS_ENABLE_GPU + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* d_normFactors; + float_t* d_normFactorsSub; +#else + galois::graphs::GluonSubstrate* syncSubstrate; +#endif + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector + normFactors; // normalization constant based on graph structure + std::vector normFactorsSub; // normalization constant for subgraph + + Reader reader; + +public: + // TODO better constructor + DistContext(); + DistContext(bool isDevice) + : is_device(isDevice), is_selfloop_added(false), usingSingleClass(true), + dataset(""), num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} + ~DistContext(); + + size_t read_graph(std::string dataset_str, bool selfloop = false); + + //! read labels of local nodes only + size_t read_labels(bool isSingleClassLabel, std::string dataset_str); + + //! read features of local nodes only + size_t read_features(std::string dataset_str); + + //! read masks of local nodes only + size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); + + DGraph* getGraphPointer() { return partitionedGraph; } + Graph* getLGraphPointer() { return lGraph; } + Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; + + void initializeSyncSubstrate(); +#ifdef GALOIS_ENABLE_GPU + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } + float_t* get_norm_factors_ptr() { return d_normFactors; } + float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } +#else + void saveDistGraph(DGraph* a); + galois::graphs::GluonSubstrate* getSyncSubstrate(); + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + float_t* get_norm_factors_ptr() { return normFactors.data(); } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } +#endif + + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + + //! allocate the norm factor vector + void allocNormFactor(); + void allocNormFactorSub(int subID); + //! 
construct norm factor vector by using data from global graph + void constructNormFactor(deepgalois::Context* globalContext); + void constructNormFactorSub(int subgraphID); + + void constructSubgraphLabels(size_t m, const mask_t* masks); + void constructSubgraphFeatures(size_t m, const mask_t* masks); + + //! return label for some node + //! NOTE: this is LID, not GID + label_t get_label(size_t lid) { return h_labels[lid]; } + + //! returns pointer to the features of each local node + float_t* get_in_ptr(); + + //! allocate memory for subgraphs (don't actually build them) + void allocateSubgraphs(int num_subgraphs, unsigned max_size); + + //! return if a vertex is owned by the partitioned graph this context contains + bool isOwned(unsigned gid); + //! return if part graph has provided vertex for given gid locally + bool isLocal(unsigned gid); + //! get GID of an lid for a vertex + unsigned getGID(unsigned lid); + //! get local id of a vertex given a global id for that vertex + unsigned getLID(unsigned gid); +}; + +} // namespace deepgalois + +#endif diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h new file mode 100644 index 0000000000..3f613a3039 --- /dev/null +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -0,0 +1,27 @@ +#pragma once + +#include "deepgalois/types.h" +#include "deepgalois/lgraph.h" + +#ifdef GALOIS_ENABLE_GPU +#define USE_CSRGRAPH +#ifdef USE_CSRGRAPH +#include "graph_gpu.h" +#endif +#else +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" +#endif + +namespace deepgalois { +using edge_iterator = index_t; +using GraphCPU = LearningGraph; +#ifdef GALOIS_ENABLE_GPU +using DGraph = CSRGraph; +using Graph = CSRGraph; +using GraphGPU = CSRGraph; +#else +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; +#endif +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h new file mode 100644 index 0000000000..bd33924eee --- /dev/null +++ b/libdeepgalois/include/deepgalois/Net.h @@ -0,0 +1,151 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. + */ +#pragma once +#include +#include "deepgalois/types.h" +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/optimizer.h" +#include "deepgalois/utils.h" +#include "deepgalois/Context.h" +#include "deepgalois/GraphTypes.h" +#include "deepgalois/DistContext.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +// N: number of vertices, D: feature vector dimentions, +// E: number of distinct labels, i.e. 
number of vertex classes +// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) +// layer 2: features N x 16, weights 16 x E, out N x E +class Net { + std::string header; + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t globalSamples; // number of samples: N + size_t distNumSamples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + unsigned h1; // hidden layer size + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting + // begins/ends below are global ids + size_t globalTrainBegin; + size_t globalTrainEnd; + size_t globalTrainCount; + size_t globalValBegin; + size_t globalValEnd; + size_t globalValCount; + size_t globalTestBegin; + size_t globalTestEnd; + size_t globalTestCount; + int val_interval; + int num_subgraphs; + unsigned subgraphNumVertices; + bool is_selfloop; + + mask_t* globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation + mask_t* globalTestMasks; // masks for test + // TODO it's looking like we may not even need these dist versions + mask_t* distTrainMasks; + mask_t* distValMasks; + mask_t* distTestMasks; // masks for test, dst + + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device + + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + // masks for subgraphs on device; size of local graph + mask_t* d_subgraphs_masks; + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network + + // one context is for entire graph; other is for partitioned graph + // TODO optimize single host case + + //! context holds all of the graph data + deepgalois::Context* graphTopologyContext; + + //! dist context holds graph data of the partitioned graph only + deepgalois::DistContext* distContext; + DGraph* dGraph; + Sampler* sampler; + +public: + //! Default net constructor + Net() + : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, false, true, false, false, + 25, 9000, 1) {} + + //! Net constructor + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv); + + // allocate memory for subgraph masks + void allocateSubgraphsMasks(int num_subgraphs); + + //! Initializes metadata for the partition: loads data, labels, etc + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } + void regularize(); // add weight decay + void train(optimizer* opt, bool need_validate); + double evaluate(std::string type, acc_t& loss, acc_t& acc); + + //! read masks of test set for GLOBAL set + void read_test_masks(std::string dataset); + //! 
read test masks only for local nodes; assumes dist context is initialized + void readDistributedTestMasks(std::string dataset); + + // void copy_test_masks_to_device(); + void construct_layers(); + + //! Add an l2_norm layer to the network + void append_l2norm_layer(size_t layer_id); + + //! Add an dense layer to the network + void append_dense_layer(size_t layer_id); + + //! Add an output layer to the network + void append_out_layer(size_t layer_id); + + //! Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true); + + // update trainable weights after back-prop + void update_weights(optimizer* opt); + + // forward propagation + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); + void bprop(); // back propagation + void set_contexts(); // Save the context + void set_netphases(net_phase phase); // current phase: train or test + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); + acc_t masked_multi_class_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h new file mode 100644 index 0000000000..ff1b460b10 --- /dev/null +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -0,0 +1,112 @@ +#pragma once +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { +#define ETA 1.5 // length factor of DB in sampling +#define SAMPLE_CLIP 3000 // clip degree in sampling +#define DEFAULT_SIZE_FRONTIER 1000 +#define DEFAULT_SIZE_SUBG 9000 + +class Sampler { +public: + typedef int db_t; + +protected: + index_t m; // number of vertice in the frontier + size_t count_; + + //! averaged degree of masked graph + int avg_deg; + //! average degree cut off to a clip + int subg_deg; + + // VertexList vertices_; + // mask_t* masks_; + + //! List of training nodes; sampling set + std::vector trainingNodes; + + //! masked original graph; typically to the training set + GraphCPU* globalMaskedGraph; + GraphCPU* globalGraph; + DGraph* partGraph; + + //! Reindex a graph to only contain those in the vertex set + void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! mg + template + void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); + + //! determine degree of each vertex in a masked graph (given by masks and g) + template + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees); + + //! Set masks bitset with IDs in the vertices VertexSet + // void createMasks(size_t n, VertexSet vertices, mask_t* masks); + // inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + // void checkGSDB(std::vector& DB0, std::vector& DB1, + // std::vector& DB2, index_t size); + + //! convert set of gids to lids + VertexSet convertToLID(VertexSet& gidSet); + + void createMasks(size_t n, VertexSet vertices, mask_t* masks) { + std::fill(masks, masks + n, 0); + for (auto v : vertices) + masks[v] = 1; + } + + //! 
helper function to get degree of some vertex given some graph + inline unsigned getDegree(GraphCPU* g, index_t v) { + return g->edge_end_host(v) - g->edge_begin_host(v); + } + + inline VertexList reindexVertices(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; + } + + // helper function for graph saint implementation below + void checkGSDB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, index_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); + } + +public: + Sampler() : m(DEFAULT_SIZE_FRONTIER) {} + ~Sampler() {} + + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output + void generateSubgraph(VertexSet& vertex_set, mask_t* masks, Graph* sg); + + //! API function for user-defined selection strategy + // TODO how to expose this? + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, + VertexSet& vertex_set); + virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); + + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); + + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h new file mode 100644 index 0000000000..5cbb1909fd --- /dev/null +++ b/libdeepgalois/include/deepgalois/configs.h @@ -0,0 +1,13 @@ +#pragma once + +namespace deepgalois { + +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + +#define NUM_DATASETS 9 +const std::string dataset_names[NUM_DATASETS] = { + "cora", "citeseer", "ppi", "pubmed", "flickr", + "yelp", "reddit", "amazon", "tester"}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h new file mode 100644 index 0000000000..4e4e9842b1 --- /dev/null +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -0,0 +1,192 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. 
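+// e.g. for N = 1000 this is (1000 + 255) / 256 = 4 blocks of CUDA_NUM_THREADS
+// threads each, so a launch of the form
+//   my_kernel<<<CUDA_GET_BLOCKS(N), CUDA_NUM_THREADS>>>(...)  // illustrative
+// covers all N elements at least once.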
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +inline unsigned CudaTest(const char* msg) { + cudaError_t e; + // cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + +inline const char* cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; +#if CUDA_VERSION >= 6000 + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; +#endif +#if CUDA_VERSION >= 6050 + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; +#endif + default: + break; + } + return "Unknown cublas status"; +} + +inline const char* cusparseGetErrorString(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return "CUSPARSE_STATUS_ZERO_PIVOT"; + default: + break; + } + return "Unknown cusparse status"; +} + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + default: + break; + } + return "Unknown curand status"; +} + +// CUDA: various 
checks for different function calls. +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUSPARSE_CHECK(condition) \ + do { \ + cusparseStatus_t status = condition; \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuSPARSE error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cusparseGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +// CUDA: check for error after kernel execution and exit loudly if there is one. +#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + +inline void print_device_vector(size_t n, const float_t* d_x, + std::string name = "x") { + float_t* h_x = new float_t[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} + +inline void print_device_int_vector(size_t n, const int* d_x, + std::string name = "x") { + int* h_x = new int[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h new file mode 100644 index 0000000000..2918cdd8dd --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -0,0 +1,160 @@ +#ifndef __GLUON_GRADIENTS__ +#define __GLUON_GRADIENTS__ + +#include "galois/gstl.h" +#include "galois/runtime/Network.h" +#include "deepgalois/types.h" + +namespace deepgalois { + +/** + * Wraps the weight gradients and provides an interface for Gluon to + * synchronize them during distributed execution. + */ +class GluonGradients { +private: + //! Data type used for gradients + using GradientType = float_t; + //! type that's being used by the gradient vector + using GradientVecType = vec_t; + + GradientVecType& _gradients; + //! number of weight gradients + size_t _numWeights; + //! number of gradients this host is responsible for + size_t _numOwned; + + //! My host ID + unsigned _myHost; + //! Total num hosts in system + unsigned _totalHosts; + + //! first node I own + unsigned _beginMaster; + //! last node I own (contiguous chunk) + unsigned _endMaster; + + //! my nodes whose's masters are on other hosts; global ids + std::vector> _mirrorNodes; + //! 
nodes that are mirrors on this host + std::vector> _mirrorRanges; + +public: + bool is_a_graph() { return true; } + + /** + * Save weight gradients + number of them (i.e. size). + * Then setup mirror metadata for Gluon to use during setup. + */ + GluonGradients(GradientVecType& gradients, size_t numWeights) + : _gradients(gradients), _numWeights(numWeights) { + _myHost = galois::runtime::getSystemNetworkInterface().ID; + _totalHosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + _mirrorNodes.resize(_totalHosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < _totalHosts; h++) { + std::pair curRange = + galois::block_range((size_t)0, _numWeights, h, _totalHosts); + + if (h != _myHost) { + // setup mirrors for the host h which is just the list of IDs + size_t curW = curRange.first; + size_t lastW = curRange.second; + size_t numW = lastW - curW; + + // set mirrors for host h + _mirrorNodes[h].reserve(numW); + for (; curW < lastW; curW++) { + _mirrorNodes[h].push_back(curW); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + _beginMaster = curRange.first; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; + + // first range is 0 to begin master + if (_beginMaster > 0) { + galois::gInfo("[", _myHost, "] Mirror range ", 0, " to ", + _beginMaster); + _mirrorRanges.emplace_back(0, _beginMaster); + } + + // second range is endMaster to end + if (_endMaster < _numWeights) { + galois::gInfo("[", _myHost, "] Mirror range ", _endMaster, " to ", + _numWeights); + _mirrorRanges.emplace_back(_endMaster, _numWeights); + } + } + } + + galois::gInfo("[", _myHost, "] This host owns ", _beginMaster, " to ", + _endMaster); + } + + //! Size is number of weights + size_t size() const { return _numWeights; } + + //! Global size is number of weights + size_t globalSize() const { return _numWeights; } + + //! Return the weights owned by this host + size_t numMasters() const { return _numOwned; } + + //! Return host ID + unsigned myHostID() const { return _myHost; } + + //! Return num hosts in the system + unsigned numHosts() const { return _totalHosts; } + + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t nodeID) const { return nodeID; } + + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t nodeID) const { return nodeID; } + + //! Return local weight w + GradientType& getData(uint32_t w) const { return _gradients[w]; } + + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return _mirrorRanges; + } + + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { return _mirrorNodes; } + + //! clears the vector + // TODO return to this when we start distributing on GPUs; wrapper + // end probably shouldn't be managing this MAYBE + void deallocate() { _gradients.clear(); } + + // Essentially no-op functions follow + + //! no nodes with edges + size_t getNumNodesWithEdges() { return 0; } + + //! No edges; not a vertex cut + bool is_vertex_cut() const { return false; } + + //! no edges, return 0 + unsigned edge_begin(uint32_t) { return 0; } + + //! no edges, return 0 + unsigned edge_end(uint32_t) { return 0; } + + //! no edges, return 0 + unsigned getEdgeDst(uint32_t) { return 0; } + + //! 
no edges, return 0 + unsigned getEdgeData(uint32_t) { return 0; } +}; + +} // namespace deepgalois + +#endif // end header guard diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h new file mode 100644 index 0000000000..d4c23af1bb --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -0,0 +1,51 @@ +#ifndef GALOIS_ENABLE_GPU +#ifndef __GRAD_SYNC_STRUCT__ +#define __GRAD_SYNC_STRUCT__ + +#include "deepgalois/types.h" + +struct GradientSync { + using ValTy = float_t; + + static ValTy extract(uint32_t, float_t& weight) { return weight; } + + static bool reduce(uint32_t, float_t& weight, ValTy y) { + // TODO merge function here + // for now make sure the weights are close enough + // if (std::abs(weight - y) > 0.00001) { + // galois::gInfo("weight ", node_id, " not consistent with one received"); + //} + if (y == 0) { + galois::gPrint("nothing important\n"); + } + weight += y; + // need a post process divide all step + // weight /= 2; + return true; + } + + //! reset weight to 0 + static void reset(uint32_t, float_t& weight) { weight = 0; } + + //! save weight + static void setVal(uint32_t, float_t& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +// no bitset; everything is sent anyways +#endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h new file mode 100644 index 0000000000..570aa56d2b --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -0,0 +1,60 @@ +#ifndef GALOIS_ENABLE_GPU +#ifndef __GRAPH_CONV_SYNC_STRUCT__ +#define __GRAPH_CONV_SYNC_STRUCT__ +#include "galois/BufferWrapper.h" + +struct GraphConvSync { + using ValTy = galois::BufferWrapper; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + ValTy vecToReturn( + &deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); + // move constructor should kick in here to avoid return copy + return vecToReturn; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += + y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +GALOIS_SYNC_STRUCTURE_BITSET(conv); +#endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h new file mode 100644 index 0000000000..8ef845b1d9 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -0,0 +1,23 @@ +#pragma once +#include "deepgalois/types.h" +//! For each node in the graph, add the embeddings of all of its neighbors +//! together (using norm_factor if specified) +#ifndef GALOIS_ENABLE_GPU +#include "deepgalois/GraphTypes.h" +namespace deepgalois { +// TODO template arg +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +} // namespace deepgalois +#else +#include "deepgalois/GraphTypes.h" +// #include "graph_gpu.h" +namespace deepgalois { +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +} // namespace deepgalois +#endif diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h new file mode 100644 index 0000000000..e4b59e694f --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` +class elementwise_add_layer : public layer { +public: + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { + return std::string("elementwise_add"); + } + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + in_grad = out_grad; + } +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h new file mode 100644 index 0000000000..14c47c9813 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -0,0 +1,85 @@ +#pragma once +#include "layer.h" +#include "deepgalois/layers/aggregator.h" + +/** + * GraphConv Layer; based on DGL implementation + follows TinyDNN layer + * convention + * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html + * + * Parameters + * ---------- + * x: int, number of samples. + * y: int, Input feature size. + * z: int, Output feature size. + * dropout: bool, optional, if True, a dropout operation is applied before + * other operations. + * norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. + * Default: ``True``. + * bias : bool, optional, if True, adds a learnable bias to the output. + * Default: ``False``. + * activation: default false + */ +namespace deepgalois { +class graph_conv_layer : public layer { +public: + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float_t dropout_rate, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); + scale_ = 1. / (1. - dropout_rate_); + } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} + ~graph_conv_layer() {} + void malloc_and_init(); + std::string layer_type() const override { return std::string("graph_conv"); } + virtual acc_t get_weight_decay_loss(); + //! Uses weights contained in this layer to update in_data (results from + //! previous) and save result to out_data + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + //! Uses gradients from layer after this one to update both own weight + //! 
gradients as well as gradients for the features (in_grad) + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + // user-defined aggregate function +#ifndef GALOIS_ENABLE_GPU + virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); +#else + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out); + void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); +#endif + // user-defined combine function + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out); + +private: + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + const float_t dropout_rate_; + float_t scale_; + float_t* out_temp; //!< intermediate data temporary + float_t* in_temp; + float_t* in_temp1; + float_t* trans_data; // y*x + mask_t* dropout_mask; // x*y + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 + + // Glorot & Bengio (AISTATS 2010) + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, + unsigned seed = 1); + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h new file mode 100644 index 0000000000..c7167700a2 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// L2 Normalization Layer +class l2_norm_layer : public layer { +public: + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + } + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + ~l2_norm_layer() {} + std::string layer_type() const override { return std::string("l2_norm"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + +protected: + float_t epsilon_; + float_t scale_; +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h new file mode 100644 index 0000000000..6e1ac879cc --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -0,0 +1,215 @@ +#pragma once +/** + * Code from on below link. Modified under Galois's license. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ +#include +#include "deepgalois/GraphTypes.h" +#include "deepgalois/Context.h" +#include "deepgalois/optimizer.h" +#include "deepgalois/layers/node.h" +#include "deepgalois/DistContext.h" + +#ifndef GALOIS_ENABLE_GPU +#include "galois/graphs/GluonSubstrate.h" +#include "deepgalois/layers/GluonGradients.h" +#endif + +namespace deepgalois { + +/** + * base class of all kind of NN layers + * + * sub-class should override these methods: + * - forward_propagation ... body of forward-pass calculation + * - back_propagation ... body of backward-pass calculation + * - in_shape ... specify input data shapes + * - out_shape ... specify output data shapes + * - layer_type ... name of layer + * + * Node inheritance is just to get accessed to linked-list semantics it + * provides + **/ +class layer : public deepgalois::node { +public: + using ContextType = deepgalois::DistContext; + +protected: +#ifndef GALOIS_ENABLE_GPU + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; +#endif + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + bool use_mask; + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; // parameters to learn on device (GPU) + float_t* d_weight_grad; // weight gradient on device (GPU) + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; // masks on device (GPU) + float_t* loss; // error for each vertex: N x 1 + ContextType* context; + label_t* labels; + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores + vec_t norm_scores_grad; // gradients of normalized scores +// TODO +#ifdef GALOIS_ENABLE_GPU + GraphGPU* graph_gpu; +#else + Graph* graph_cpu; + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif + +public: + layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims), labels(NULL) {} + virtual ~layer() = default; + virtual std::string layer_type() const = 0; + virtual void malloc_and_init() {} + void print_layer_info() { //! 
debug print function + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "] Layer " << level_ + << " type: " << layer_type() << "input[" << input_dims[0] << "," + << input_dims[1] << "] output[" << output_dims[0] << "," + << output_dims[1] << "]\n"; + // galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + // "input[", input_dims[0], ",", input_dims[1], "] output[", + // output_dims[0], ",", output_dims[1], "]\n"); + } + // get methods + virtual acc_t get_prediction_loss() { return acc_t(0); } + virtual acc_t get_weight_decay_loss() { return acc_t(0); } + bool trainable() const { return trainable_; } + std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } + float_t* get_weights_ptr() { return &W[0]; } + float_t* get_weights_device_ptr() { return d_W; } + float_t* get_grads_ptr() { return &weight_grad[0]; } + float_t* get_grads_device_ptr() { return d_weight_grad; } + + // set methods + void set_netphase(net_phase ctx) { phase_ = ctx; } + void set_context(ContextType* ctx) { context = ctx; } + void set_trainable(bool trainable) { + trainable_ = trainable; + } // is this layer trainable? + void set_labels_ptr(label_t* ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } + void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } + void set_name(std::string name) { name_ = name; } // name metadata +#ifndef GALOIS_ENABLE_GPU + void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } +#else + void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } +#endif + void update_dim_size(size_t g_size) { + input_dims[0] = output_dims[0] = g_size; + } + + //! set the data of the previous layer connected to this one + void set_in_data(float_t* data) { + prev_ = + std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + use_mask = false; + if (masks != NULL) { + use_mask = true; +#ifndef GALOIS_ENABLE_GPU + masks_ = masks; +#else + d_masks_ = masks; +#endif + } + } + + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], + output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + + // main functions for layer work + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; + + //! calls forward propagation using previous layer as input and writes + //! to next layer as output + void forward() { + // std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + + //! calls backward propagation + void backward() { + // std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + + //! use optimizer to update weights given gradient (weight_grad) + void update_weight(deepgalois::optimizer* opt) { +#ifndef GALOIS_ENABLE_GPU + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ // bool parallel = (W.size() >= 512); + opt->update(layer::weight_grad, layer::W); // W += grad +#else + opt->update_gpu(input_dims[1] * output_dims[1], d_weight_grad, + d_W); // W += grad +#endif + // prev()->clear_grads(); + next()->clear_grads(); + } +}; + +//! Connects tail to head's edge and sets that edge's target to tail +// inline void connect(layer* head, layer* tail) { +inline void connect(layer* head, layer* tail) { + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); +} + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h new file mode 100644 index 0000000000..2f43e0a228 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -0,0 +1,22 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// Leaky ReLU Layer +class leaky_relu_layer : public layer { +public: + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + ~leaky_relu_layer() {} + std::string layer_type() const override { return std::string("leaky_relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + +protected: + float_t epsilon_; + size_t n; +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h new file mode 100644 index 0000000000..ebcc774cc1 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -0,0 +1,34 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class linear_layer : public layer { +public: + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; + } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : linear_layer(level, 1.0, 0.0, in_dim, out_dim) {} + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } + +protected: + float_t scale_, bias_; +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h new file mode 100644 index 0000000000..11499bbede --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -0,0 +1,75 @@ +#pragma once +/** + * Code modified from below + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/node.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ + +#include +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +class node; +class layer; +class edge; + +typedef std::shared_ptr edgeptr_t; + +// node data structure: each layer is a node, two layers are connected by an +// edge +class node : public std::enable_shared_from_this { +public: + node() { + prev_ = NULL; + next_ = NULL; + } + // node(size_t in_size, size_t out_size) { + //} //: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + const edgeptr_t next() const { return next_; } + +protected: + // node() = delete; + friend void connect(layer* head, layer* tail); + mutable edgeptr_t prev_; + mutable edgeptr_t next_; +}; + +// edges manage the input/output data and gradients between nodes +class edge { +public: + edge(node* prev, size_t n, size_t len) + : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc(); + void clear_grads(); + void merge_grads(float_t* dst); + void set_data(float_t* ptr) { data_ = ptr; } + float_t* get_data() { return data_; } + const float_t* get_data() const { return data_; } + float_t* get_gradient() { return grad_; } + const float_t* get_gradient() const { return grad_; } + + const node* next() const { return next_; } + node* prev() { return prev_; } + const node* prev() const { return prev_; } + void add_next_node(node* next) { next_ = next; } + +private: + size_t num_samples_; // number of samples + size_t ft_dim_; // feature dimensions + float_t* data_; // feature vectors + float_t* grad_; // gradients + node* prev_; // previous node, "producer" of data + node* next_; // next node, "consumer" of data +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h new file mode 100644 index 0000000000..4e1c47ed77 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h new file mode 100644 index 0000000000..be133995c0 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class sigmoid_loss_layer : public layer { +public: + sigmoid_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~sigmoid_loss_layer(); + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } + void malloc_and_init(); + inline label_t get_label(size_t i, size_t j); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_prediction_loss(); +}; +} // namespace deepgalois diff --git 
a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..7ba096a2aa --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class softmax_loss_layer : public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~softmax_loss_layer(); + std::string layer_type() const override { + return std::string("softmax_loss"); + } + void malloc_and_init(); + inline label_t get_label(size_t i); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_prediction_loss(); +}; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h new file mode 100644 index 0000000000..01b84a60b6 --- /dev/null +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -0,0 +1,171 @@ +#pragma once +#include "deepgalois/types.h" +#include +#include + +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + +namespace deepgalois { + +class LearningGraph { + typedef std::vector IndexList; + // typedef index_t* IndexList; +protected: + bool is_device; + index_t max_size_; + index_t num_vertices_; + index_t num_edges_; + IndexList rowptr_; + IndexList colidx_; + IndexList degrees_; + vdata_t* vertex_data_; + edata_t* edge_data_; + + index_t* d_rowptr_; + index_t* d_colidx_; + index_t* d_degrees_; + vdata_t* d_vertex_data_; + edata_t* d_edge_data_; + std::vector> mirrorNodes; + +public: + typedef size_t iterator; + LearningGraph(bool use_gpu) + : is_device(use_gpu), max_size_(0), num_vertices_(0), num_edges_(0), + vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph() : LearningGraph(false) {} + ~LearningGraph() { dealloc(); } + void init(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + } + size_t size() { return (size_t)num_vertices_; } + size_t sizeEdges() { return (size_t)num_edges_; } + index_t get_degree(index_t vid) { return degrees_[vid]; } + + iterator begin() const { return iterator(0); } + iterator end() const { return iterator(num_vertices_); } + void progressPrint(unsigned maxii, unsigned ii); + void allocOnDevice(bool no_edge_data_); + void copy_to_cpu(); + void copy_to_gpu(); + void dealloc(); + void degree_counting(); + void constructNodes() {} + void set_max_size(index_t max) { + assert(max > 0); + max_size_ = max; + } + + void readGraph(std::string dataset, bool selfloop = false); + void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } + void allocateFrom(index_t nv, index_t ne) { + // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, + // num_edges_); + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_ + 1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); + rowptr_[0] = 0; + } + + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; + if (edge_data_) + edge_data_[eid] = edata; + } + + void add_selfloop() { + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i + 1]; + bool 
selfloop_inserted = false; + if (start == end) { + colidx_[start + i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst = old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e + i] = i; + colidx_[e + i + 1] = dst; + } else if (e + 1 == end) { + selfloop_inserted = true; + colidx_[e + i + 1] = i; + colidx_[e + i] = dst; + } else + colidx_[e + i] = dst; + } else + colidx_[e + i + 1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) + rowptr_[i] += i; + num_edges_ += num_vertices_; + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, + num_edges_); + } + + bool isLocal(index_t vid); + index_t getLID(index_t vid); + bool is_vertex_cut(); + std::vector>& getMirrorNodes(); + uint64_t numMasters(); + uint64_t globalSize(); + + index_t* row_start_host_ptr() { return &rowptr_[0]; } + index_t* edge_dst_host_ptr() { return &colidx_[0]; } + index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } + index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } + index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } +#ifndef GALOIS_ENABLE_GPU + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } + index_t getDegree(index_t vid) { return degrees_[vid]; } + index_t* row_start_ptr() { return &rowptr_[0]; } + const index_t* row_start_ptr() const { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + const index_t* edge_dst_ptr() const { return &colidx_[0]; } + index_t* degrees_ptr() { return °rees_[0]; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } +#else + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src + 1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + // CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + // CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + index_t* row_start_ptr() { return d_rowptr_; } + const index_t* row_start_ptr() const { return d_rowptr_; } + index_t* edge_dst_ptr() { return d_colidx_; } + const index_t* edge_dst_ptr() const { return d_colidx_; } + index_t* degrees_ptr() { return d_degrees_; } + edata_t* edge_data_ptr() { return d_edge_data_; } + vdata_t* vertex_data_ptr() { return d_vertex_data_; } + // const vdata_t *vertex_data_ptr() const { return vertex_data_; } + // const edata_t *edge_data_ptr() const { return edge_data; } + void print_test(); +#endif +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh new file mode 100644 index 0000000000..e6b5836386 --- /dev/null +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -0,0 +1,180 @@ +/** + * File inspired by similar one from TinyDNN + * https://github.com/tiny-dnn/ + */ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include +#include +#include +#include "deepgalois/types.h" + +#ifdef 
USE_MKL +#include +#else +extern "C" { +#include +} +#endif + +namespace deepgalois { + +namespace math { + +// single-precision dense matrix multiply +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); + +// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nonzero_idx, const float* B, const float beta, float* C); + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y); + +//! add 2 arrays for n elements +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); + +//! multiply n elements of vector by scalar +void scal(size_t n, const float_t alpha, float_t* x); +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y); +void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); + +//! do dot product of 2 vectors +float_t dot(size_t n, const float_t* x, const float_t* y); + +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z); + +// SAXPY stands for “Single-precision A*X Plus Y" +void axpy(size_t n, const float_t a, float_t* x, float_t* y); + +// Returns the index of the maximum value +int argmax(const size_t n, const float_t* x); // the arguments of the maxima + +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) +//! / 2 +float_t l2_norm(size_t n, const float_t* a); + +//! clear n elements of a vector +void clear_cpu(size_t n, float_t* in); + +//! copy vector from in -> out; first len elements +void copy_cpu(size_t len, const float_t* in, float_t* out); + +// dropout functions randomly remove weights +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* mask, float_t* out); + +// dropout derivative: use existing dropouts in masks instead of generating +// them; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* mask, float_t* out); + +//! 
ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise +void relu_cpu(size_t n, const float_t* in, float_t* out); +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + +// Leaky ReLU +void leaky_relu(float_t epsilon, float_t in, float_t& out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out); +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out); + +// Loss function for single-class label (one-hot) data: softmax +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); + +// Cross entropy +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); + +// Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); + +// dropout functions randomly remove weights +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, + float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, + float_t* out); + +//! transposes a matrix (malloc'd array) +void transpose(size_t x, size_t y, const float_t* in, float_t* out); + +} // namespace math +} // namespace deepgalois + +// GPU operators +bool isnan_gpu(int n, + const float_t* array); // does array contain any 'nan' element +void init_const_gpu(int n, float_t value, float_t* array); +void copy_gpu(int len, const float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, + float_t* y); // axpy +void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff); // Leaky ReLU derivative +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nonzero_idx, const float* B, const float beta, + float* trans_C, float* C); +void softmax_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* 
out_data); +void d_softmax_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); +void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int n, const float alpha, float* X); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r); +bool is_allocated_device(float_t* data); +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); +void float_malloc_device(int n, float_t*& ptr); +void float_free_device(float_t*& ptr); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); +void uint8_malloc_device(int n, uint8_t*& ptr); +void uint8_free_device(uint8_t*& ptr); +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss); +acc_t l2_norm_gpu(int n, const float_t* in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); +#endif diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h new file mode 100644 index 0000000000..694819591c --- /dev/null +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -0,0 +1,197 @@ +/** + * Code taken/modified from below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused under 3-BSD + */ +#pragma once + +// TODO: +// - use classes, not structs (modern C++) +// - templatize this instead of using inheritance +// - put optimizers in their own namespace + +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +// base class of optimizer +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss +// function) +struct optimizer { + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t& dW, vec_t& W) = 0; +#ifdef GALOIS_ENABLE_GPU + virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; +#endif + virtual void reset() {} // override to implement pre-learning action +}; + +// helper class to hold N values for each weight +template +struct stateful_optimizer : public optimizer { + void reset() override { + for (auto& e : E_) + e.clear(); + } + +protected: + template + vec_t& get(const vec_t& key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) + E_[Index][&key].resize(key.size(), float_t(0)); + return E_[Index][&key]; + } + std::unordered_map E_[N]; +#ifdef GALOIS_ENABLE_GPU + template + float_t* get_gpu(const size_t n, const float_t* key); + std::unordered_map dE_[N]; +#endif +}; + +/** + * adaptive gradient method + * + * J Duchi, E Hazan and Y Singer, + * Adaptive subgradient methods for online learning and stochastic optimization + * The Journal of Machine Learning Research, pages 2121-2159, 2011. + **/ +struct adagrad : public stateful_optimizer<1> { + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate +private: + float_t eps; +}; + +/** + * RMSprop + * + * T Tieleman, and G E Hinton, + * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) + **/ +struct RMSprop : public stateful_optimizer<1> { + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate + float_t mu; // decay term +private: + float_t eps; // constant value to avoid zero-division +}; + +// Adam: A Method for Stochastic Optimization +// http://arxiv.org/abs/1412.6980 +struct adam : public stateful_optimizer<2> { + adam() + : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * @brief [a new optimizer (2015)] + * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) + * http://arxiv.org/abs/1412.6980] + * + */ +struct adamax : public stateful_optimizer<2> { + adamax() + : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), + eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef 
GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +// SGD without momentum +// slightly faster than tiny_dnn::momentum +struct gradient_descent : public optimizer { + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + float_t alpha; // learning rate + float_t lambda; // weight decay +}; + +/** + * SGD with momentum + * + * B T Polyak, + * Some methods of speeding up the convergence of iteration methods + * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. + **/ +struct momentum : public stateful_optimizer<1> { +public: + momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +/** + * SGD with Nesterov momentum + * + * Y Nesterov, + * A method for unconstrained convex minimization problem with the rate of + * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. + **/ +struct nesterov_momentum : public stateful_optimizer<1> { +public: + nesterov_momentum() + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + void update(const vec_t& dW, vec_t& W); +#ifdef GALOIS_ENABLE_GPU + void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h new file mode 100644 index 0000000000..6e5cb0fe5b --- /dev/null +++ b/libdeepgalois/include/deepgalois/random.h @@ -0,0 +1,53 @@ +// random number generators for CPU +#pragma once + +#include +#include "galois/Galois.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + +class random_generator { +public: + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h new file mode 100644 index 0000000000..c25eeceac2 --- /dev/null +++ b/libdeepgalois/include/deepgalois/reader.h @@ -0,0 +1,22 @@ +#pragma once +#include "deepgalois/lgraph.h" +// #include "galois/DistGalois.h" +namespace deepgalois { + +class Reader { +private: + std::string dataset_str; + void progressPrint(unsigned maxii, unsigned ii); + +public: + Reader() : dataset_str("") {} + Reader(std::string dataset) : dataset_str(dataset) {} + void init(std::string dataset) { dataset_str = dataset; } + size_t read_labels(bool is_single_class, label_t*& labels); + size_t read_features(float_t*& feats, std::string filetype = "bin"); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks); + void readGraphFromGRFile(LearningGraph* g); +}; + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h new file mode 100644 index 0000000000..17dd05b15d --- /dev/null +++ b/libdeepgalois/include/deepgalois/types.h @@ -0,0 +1,58 @@ +#ifndef _GNN_TYPES_H_ +#define _GNN_TYPES_H_ +#include +#include +#include +#include + +// TODO namespace + +#ifdef CNN_USE_DOUBLE +typedef double float_t; +typedef double feature_t; +#else +typedef float float_t; +typedef float feature_t; // feature type +#endif +typedef std::vector vec_t; // feature vector (1D) +typedef std::vector + tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector FV; // feature vector +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef uint8_t label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test +typedef uint32_t VertexID; +typedef uint64_t EdgeID; +typedef std::vector VertexList; +typedef std::set VertexSet; +typedef std::vector dims_t; // dimentions type + +typedef uint32_t index_t; // index type +typedef float_t edata_t; // edge data type +typedef float_t vdata_t; // vertex data type +typedef float_t* emb_t; // embedding (feature vector) type + +enum class net_phase { train, test }; + +#define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define BLOCK_SIZE 256 +#define WARP_SIZE 32 +#define MAX_NUM_CLASSES 128 +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +#define USE_CUSPARSE + +namespace deepgalois { +// TODO only being used by graph conv layer at the moment so extern works, +// but this design is bad and needs to be revisited + +//! 
Set this to let sync struct know where to get data from +extern float_t* _dataToSync; +//! Set this to let sync struct know the size of the vector to use during +//! sync +extern long unsigned _syncVectorSize; +} // namespace deepgalois + +#endif diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h new file mode 100644 index 0000000000..bf74aad196 --- /dev/null +++ b/libdeepgalois/include/deepgalois/utils.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "deepgalois/types.h" + +namespace deepgalois { + +//! tracks max mem usage with rusage +// TODO use Galois's getrusage functionality +class ResourceManager { +public: + ResourceManager() {} + ~ResourceManager() {} + // peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return "Peak memory: " + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } + +private: + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } +}; + +// TODO don't need a separate timer: use Galois's regular timer +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; + +// sequential prefix sum +template +inline std::vector prefix_sum(const std::vector& in) { + std::vector prefix(in.size() + 1); + OutTy total = 0; + for (size_t i = 0; i < in.size(); i++) { + prefix[i] = total; + total += (OutTy)in[i]; + } + prefix[in.size()] = total; + return prefix; +} + +template +OutTy* parallel_prefix_sum(const std::vector& in); + +// Utility function to randomly select k items from [begin, end) +template +inline T* select_k_items(T k, T begin, T end) { + auto i = begin; + + // reservoir[] is the output array. Initialize + // it with first k vertices + T* reservoir = new T[k]; + for (; i < k; i++) + reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. + auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) + reservoir[j] = i; + } + return reservoir; +} + +// Utility function to find ceiling of r in arr[l..h] +template +inline T find_ceil(T* arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? 
l : -1; +} + +// Utility function to select one element from n elements given a frequency +// (probability) distribution +// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ +template +T select_one_item(T n, T* dist) { + T* offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) + offsets[i] = offsets[i - 1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n - 1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); +} + +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t* masks, + size_t num_classes, label_t* ground_truth, float_t* pred); + +} // namespace deepgalois diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt new file mode 100644 index 0000000000..d9bf751eac --- /dev/null +++ b/libdeepgalois/licensenote.txt @@ -0,0 +1,59 @@ +TODO + +figure out which files have coded based on other codebsaes, get license, +note here + +e.g. +https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn +under BSD-3 + +DGL structure as well from what I can tell + +================================================================================ +Caffe License +================================================================================ + +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014-2017 The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014-2017, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, +or otherwise, the contributor releases their content to the +license and copyright terms herein. 
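The utility header above (deepgalois/utils.h) declares prefix_sum, find_ceil, and select_one_item, which together pick an index in proportion to a frequency distribution: accumulate a running sum of the frequencies, draw r uniformly from [1, total], and binary-search for the first running sum that is >= r. Below is a minimal standalone sketch of that pattern, with illustrative names only; it does not reuse the library's templates and is not part of the diff, just a self-contained C++ illustration of the same idea.

    #include <cstdlib>
    #include <ctime>
    #include <iostream>
    #include <vector>

    // prefix[i] holds the sum of freq[0..i-1]; prefix.back() is the total mass
    // (same exclusive-prefix convention as deepgalois::prefix_sum).
    static std::vector<int> prefix_sum(const std::vector<int>& freq) {
      std::vector<int> prefix(freq.size() + 1, 0);
      for (size_t i = 0; i < freq.size(); i++)
        prefix[i + 1] = prefix[i] + freq[i];
      return prefix;
    }

    // Draw an index in [0, freq.size()) with probability proportional to
    // freq[i], by binary-searching the prefix sums (the role find_ceil plays).
    static size_t weighted_pick(const std::vector<int>& prefix) {
      int total = prefix.back();
      int r     = (std::rand() % total) + 1; // r in [1, total]
      size_t lo = 1, hi = prefix.size() - 1;
      while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (prefix[mid] < r)
          lo = mid + 1;
        else
          hi = mid;
      }
      return lo - 1; // convert back to an index into the frequency array
    }

    int main() {
      std::srand(static_cast<unsigned>(std::time(nullptr)));
      std::vector<int> freq = {1, 3, 6}; // index 2 should win ~60% of draws
      auto prefix = prefix_sum(freq);
      std::vector<int> hits(freq.size(), 0);
      for (int i = 0; i < 10000; i++)
        hits[weighted_pick(prefix)]++;
      for (size_t i = 0; i < hits.size(); i++)
        std::cout << "index " << i << " drawn " << hits[i] << " times\n";
      return 0;
    }

Running the sketch prints draw counts roughly proportional to 1:3:6, which is the behavior select_one_item relies on when sampling from an arbitrary discrete distribution.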
diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh new file mode 100755 index 0000000000..da9861fb2e --- /dev/null +++ b/libdeepgalois/scripts/run-multi.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +#GRAPHS="ppi yelp amazon" +GRAPHS="ppi" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 64 128" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh new file mode 100755 index 0000000000..a6bc223ebd --- /dev/null +++ b/libdeepgalois/scripts/run-single.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +GRAPHS="cora citeseer pubmed flickr reddit" +#GRAPHS="cora" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 32 64 128 256 512" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/test-multi.sh b/libdeepgalois/scripts/test-multi.sh new file mode 100755 index 0000000000..a67bd047a8 --- /dev/null +++ b/libdeepgalois/scripts/test-multi.sh @@ -0,0 +1 @@ +./gcn ppi -k=20 -t=14 -sc=0 -h=128 diff --git a/libdeepgalois/scripts/test-single.sh b/libdeepgalois/scripts/test-single.sh new file mode 100755 index 0000000000..78093d71ed --- /dev/null +++ b/libdeepgalois/scripts/test-single.sh @@ -0,0 +1 @@ +./gcn cora -k=200 -t=14 diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp new file mode 100644 index 0000000000..21bcad0fe3 --- /dev/null +++ b/libdeepgalois/src/DistContext.cpp @@ -0,0 +1,405 @@ +#include "deepgalois/DistContext.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" + +namespace deepgalois { +DistContext::DistContext() : DistContext(false) { syncSubstrate = NULL; } + +DistContext::~DistContext() {} + +void DistContext::saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy")); +} + +// TODO move to reader class +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; + this->usingSingleClass = isSingleClassLabel; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading labels from disk...\n"); + + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; + // read file header + in >> m >> this->num_classes >> std::ws; + assert(m == dGraph->globalSize()); + + // size of labels should be # local nodes + if (isSingleClassLabel) { + galois::gPrint("[", myID, "] One hot labels...\n"); + // single-class (one-hot) label for each vertex: N x 1 + this->h_labels = new label_t[dGraph->size()]; + } else { + galois::gPrint("[", myID, "] Multi-class labels...\n"); + this->h_labels = new label_t[dGraph->size() * this->num_classes]; + // multi-class label for each vertex: N x E + } + + uint32_t foundVertices = 0; + unsigned v = 0; + // each line contains a set of 0s and 1s + while (std::getline(in, line)) { + // only bother if local node + if (dGraph->isLocal(v)) { + std::istringstream label_stream(line); + unsigned x; + // for each class + for (size_t idx = 0; idx < this->num_classes; ++idx) { + // check if that class is labeled + label_stream >> x; + + // diff between single and multi class + if (isSingleClassLabel) { + if (x != 0) { + // set local id + this->h_labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } else { + this->h_labels[dGraph->getLID(v) * this->num_classes + idx] = x; + foundVertices++; + } + } + } + // always increment v + v++; + } + + in.close(); + + // print the number of vertex classes + galois::gPrint("[", myID, + "] Done with labels, unique label counts: ", 
num_classes, + "; set ", foundVertices, " nodes\n"); + + return num_classes; +} + +// TODO move to reader class +size_t DistContext::read_features(std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading features from disk...\n"); + + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + size_t m; // m = number of vertices + // dimension read + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> this->feat_len >> std::ws; + ifs.close(); + + galois::gPrint("[", myID, "] N x D: ", m, " x ", feat_len, "\n"); + + // TODO read in without using 2 in-memory buffers + // full read feats to load into h_feats + float_t* fullFeats = new float_t[m * feat_len]; + // actual stored feats + h_feats = new float_t[dGraph->size() * feat_len]; + + // read in full feats + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + in.read((char*)fullFeats, sizeof(float_t) * m * feat_len); + in.close(); + + // get the local ids we want + size_t count = 0; + for (size_t i = 0; i < m; i++) { + if (dGraph->isLocal(i)) { + // h_feats[count * feat_len] = fullFeats[i]; + std::copy(fullFeats + i * DistContext::feat_len, + fullFeats + (i + 1) * DistContext::feat_len, + &this->h_feats[dGraph->getLID(i) * DistContext::feat_len]); + count++; + } + } + GALOIS_ASSERT(count == dGraph->size()); + free(fullFeats); + + galois::gPrint("[", myID, "] Done with features, feature length: ", feat_len, + "\n"); + + return feat_len; +} + +// TODO move to reader class/reuse reader class somehow +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, + mask_t* masks, DGraph* dGraph) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + GALOIS_DIE("Dataset currently not supported"); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + galois::gPrint("[", myID, "] ", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, "(", + (float)sample_count / (float)n * (float)100, "\%)\n"); + in.close(); + return sample_count; +} + +float_t* DistContext::get_in_ptr() { return &h_feats[0]; } + +void DistContext::initializeSyncSubstrate() { + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + *DistContext::partitionedGraph, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); +} + +void DistContext::allocNormFactor() { +#ifdef USE_MKL + this->normFactors.resize(partitionedGraph->sizeEdges()); +#else + this->normFactors.resize(partitionedGraph->size()); +#endif +} + +void DistContext::allocNormFactorSub(int subID) { +#ifdef USE_MKL + 
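  // The normalization buffers are sized to match how they are indexed later:
  // under USE_MKL one factor is kept per *edge*,
  //   norm(i,j) = 1 / (sqrt(deg(i)) * sqrt(deg(j)))   (symmetric GCN normalization),
  // so the vector is resized to sizeEdges(); otherwise a per-*vertex* factor
  //   norm(v) = 1 / sqrt(deg(v))
  // is kept, so size() entries suffice. The entries themselves are filled in
  // constructNormFactor() / constructNormFactorSub() below.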
this->normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); +#else + this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); +#endif +} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Norm factor construction\n"); + // using original graph to get ids + Graph* wholeGraph = globalContext->getFullGraph(); + + allocNormFactor(); + // this is for testing purposes + // galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // [&] (unsigned i) { + // this->normFactors[i] = 0; + // } + //); + +#ifdef USE_MKL + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned i) { + float_t c_i = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } + } + }, + galois::loopname("NormCountingEdge")); +#else + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); +#endif + galois::gPrint("[", myID, "] Norm factor construction done \n"); +} + +void DistContext::constructNormFactorSub(int subgraphID) { + // galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution + + allocNormFactorSub(subgraphID); + + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); + + // TODO using partitioned subgraph rather than whoel graph; i.e. dist + // setting wrong +#ifdef USE_MKL + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned i) { + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); + e++) { + const auto j = graphToUse.getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactorsSub[e] = 0.0; + } else { + this->normFactorsSub[e] = 1.0 / (c_i * c_j); + } + } + }, + galois::loopname("NormCountingEdge")); +#else + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned v) { + // auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactorsSub[v] = 0.0; + } else { + this->normFactorsSub[v] = 1.0 / temp; + } + // galois::gPrint(this->normFactorsSub[v], "\n"); + }, + galois::loopname("NormCountingNode")); +#endif + // galois::gPrint("Sub norm factor construction done\n"); +} +//! generate labels for the subgraph, m is subgraph size, mask +//! 
tells which vertices to use +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); +} + +//! generate input features for the subgraph, m is subgraph size, +//! masks tells which vertices to use +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); +} + +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { + return DistContext::syncSubstrate; +} + +//! 
allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +bool DistContext::isOwned(unsigned gid) { + return this->partitionedGraph->isOwned(gid); +} + +bool DistContext::isLocal(unsigned gid) { + return this->partitionedGraph->isLocal(gid); +} + +unsigned DistContext::getGID(unsigned lid) { + return this->partitionedGraph->getGID(lid); +} + +unsigned DistContext::getLID(unsigned gid) { + return this->partitionedGraph->getLID(gid); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu new file mode 100644 index 0000000000..30704b0748 --- /dev/null +++ b/libdeepgalois/src/DistContext.cu @@ -0,0 +1,257 @@ +#include +#include +#include +#include +#include "deepgalois/DistContext.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/configs.h" + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +namespace deepgalois { + +// computing normalization factor for each vertex +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + assert(src < n); + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + auto start = graph.edge_begin(src); + index_t end = graph.edge_end(src); + for (index_t e = start; e != end; e++) { + index_t dst = graph.getEdgeDst(e); + // if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, + // dst, e, start, end); + assert(dst < n); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + +cublasHandle_t DistContext::cublas_handle_ = 0; +cusparseHandle_t DistContext::cusparse_handle_ = 0; +cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; +curandGenerator_t DistContext::curand_generator_ = 0; + +DistContext::DistContext() : DistContext(true) { + d_labels = NULL; + d_feats = NULL; + d_labels_subg = NULL; + d_feats_subg = NULL; + d_normFactors = NULL; + d_normFactorsSub = NULL; + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); + CUSPARSE_CHECK( + cusparseSetMatType(cusparse_matdescr_, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK( + cusparseSetMatIndexBase(cusparse_matdescr_, CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, 
CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +DistContext::~DistContext() { + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (cusparse_handle_) + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + if (cusparse_matdescr_) + CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (d_normFactors) CUDA_CHECK(cudaFree(d_normFactors)); + if (d_labels_subg) CUDA_CHECK(cudaFree(d_labels_subg)); + if (d_feats_subg) CUDA_CHECK(cudaFree(d_feats_subg)); + if (d_normFactorsSub) CUDA_CHECK(cudaFree(d_normFactorsSub)); +} + +size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { + num_classes = reader.read_labels(isSingleClass, h_labels); + return num_classes; +} + +size_t DistContext::read_features(std::string dataset_str) { + feat_len = reader.read_features(h_feats); + return feat_len; +} + +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { + return reader.read_masks(mask_type, n, begin, end, masks); +} + +//! allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + size_t labels_size = m; + if (!usingSingleClass) labels_size = m * num_classes; + h_labels_subg.resize(labels_size); + size_t count = 0; + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (usingSingleClass) h_labels_subg[count] = h_labels[i]; + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + count++; + } + } + if (d_labels_subg) uint8_free_device(d_labels_subg); + uint8_malloc_device(labels_size, d_labels_subg); + uint8_copy_device(labels_size, &h_labels_subg[0], d_labels_subg); +} + +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + //std::cout << "construct subgraph features (d_feats_subg: " << d_feats_subg << ") ... "; + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, &h_feats_subg[count * feat_len]); + count++; + } + } + if (d_feats_subg) float_free_device(d_feats_subg); + float_malloc_device(m * feat_len, d_feats_subg); + float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); + //std::cout << "Done\n"; +} + +void DistContext::constructNormFactorSub(int subgraphID) { + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + auto n = graphToUse.size(); + //std::cout << "Pre-computing subgraph normalization factor (n=" << n << ") ... 
"; + + #ifdef USE_CUSPARSE + auto nnz = graphToUse.sizeEdges(); + float_malloc_device(nnz, d_normFactorsSub); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, graphToUse, d_normFactorsSub); +#else + float_malloc_device(n, d_normFactorsSub); + norm_factor_computing_node<<>>( + n, graphToUse, d_normFactorsSub); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + //std::cout << "Done\n"; +} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + auto n = partitionedGraph->size(); + std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; + if (!is_selfloop_added) { + std::cout << "Set -sl=1 to add selfloop\n"; + exit(0); + } +#ifdef USE_CUSPARSE + auto nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, *partitionedGraph, d_normFactors); +#else + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, n * sizeof(float_t))); + norm_factor_computing_node<<>>( + n, *partitionedGraph, d_normFactors); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + std::cout << "Done\n"; +} + +/* +void DistContext::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) +CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, +CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, +cluster_seedgen())); +} +*/ +size_t DistContext::read_graph(std::string dataset, bool selfloop) { + partitionedGraph = new DGraph(); +#ifdef USE_CSRGRAPH + std::string filename = path + dataset + ".csgr"; + GraphGPU g; + g.read(filename.c_str(), false); + if (selfloop) { + g.add_selfloop(); + is_selfloop_added = selfloop; + } + g.copy_to_gpu(*partitionedGraph); +#else + partitionedGraph->readGraph(dataset); + if (selfloop) { + partitionedGraph->add_selfloop(); + is_selfloop_added = selfloop; + } + partitionedGraph->copy_to_gpu(); +#endif + return partitionedGraph->size(); +} + +void DistContext::copy_data_to_device() { + auto n = partitionedGraph->size(); + std::cout << "Copying labels and features to GPU memory. n = " << n << " ... "; + if (usingSingleClass) { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + } else { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + } + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + // print_device_vector(10, d_feats, "d_feats"); + std::cout << "Done\n"; +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp new file mode 100644 index 0000000000..d07b19f912 --- /dev/null +++ b/libdeepgalois/src/Net.cpp @@ -0,0 +1,177 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. 
+ */ + +#include "galois/Timer.h" +#include "galois/Galois.h" +#include "deepgalois/Net.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void Net::partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel) { + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); + this->distContext->saveDistGraph(dGraph); + this->distNumSamples = this->dGraph->size(); + + // TODO self loop setup would have to be done before this during partitioning + // or on master node only + + this->distContext->initializeSyncSubstrate(); + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // std::cout << "Reading label masks ... "; + this->distTrainMasks = new mask_t[this->distNumSamples]; + this->distValMasks = new mask_t[this->distNumSamples]; + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, + 0); + std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); + + // load the training/val masks + if (dataset_str == "reddit") { + // find local ID from global ID, set if it exists + for (size_t i = this->globalTrainBegin; i < this->globalTrainEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distTrainMasks[this->dGraph->getLID(i)] = 1; + } + } + for (size_t i = this->globalValBegin; i < this->globalValEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distValMasks[this->dGraph->getLID(i)] = 1; + } + } + } else { + globalTrainCount = this->distContext->read_masks( + dataset_str, "train", this->distNumSamples, this->globalTrainBegin, + this->globalTrainEnd, this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + dataset_str, "val", this->distNumSamples, this->globalValBegin, + this->globalValEnd, this->distValMasks, this->dGraph); + } + + // input feature dimension: D + feature_dims[0] = this->distContext->read_features(dataset_str); + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E +} + +void Net::allocateSubgraphsMasks(int num_subgraphs) { + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + // TODO: parallel + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); +} + +void Net::read_test_masks(std::string dataset) { + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + globalTestMasks[i] = 1; + } + } else { + globalTestCount = graphTopologyContext->read_masks( + "test", globalSamples, globalTestBegin, globalTestEnd, globalTestMasks); + } +} + +void Net::readDistributedTestMasks(std::string dataset) { + distTestMasks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + if (dGraph->isLocal(i)) + distTestMasks[dGraph->getLID(i)] = 1; + } + } else { + globalTestCount = distContext->read_masks( + dataset, 
std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, distTestMasks, dGraph); + } +} + +/** + * @param gBegin GLOBAL begin + * @param gEnd GLOBAL end + * @param gMasks: GLOBAL masks + * @param gCount GLOBAL training count + */ +acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth) { + galois::DGAccumulator accuracy_all; + galois::DGAccumulator sampleCount; + accuracy_all.reset(); + sampleCount.reset(); + + galois::do_all( + galois::iterate(gBegin, gEnd), + [&](const auto& gid) { + // only look at owned nodes (i.e. masters); the prediction for these + // should only by handled on the owner + if (this->dGraph->isOwned(gid)) { + sampleCount += 1; + uint32_t localID = this->dGraph->getLID(gid); + if (gMasks == NULL) { + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == localGroundTruth[localID]) { + accuracy_all += 1.0; + } + } else { + if (gMasks[gid] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == localGroundTruth[localID]) { + accuracy_all += 1.0; + } + } + } + } + }, + galois::loopname("getMaskedLoss")); + + gCount = sampleCount.reduce(); + galois::gDebug("Total sample count is ", gCount); + // all hosts should get same accuracy + return accuracy_all.reduce() / (acc_t)gCount; +} + +acc_t Net::masked_multi_class_accuracy(size_t gBegin, size_t gEnd, + size_t gCount, mask_t* gMasks, + float_t* preds, + label_t* localGroundTruth) { + // TODO fix this + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_DIE( + "Multi-class accuracy not yet implemented for distributed setting\n"); + } + + return deepgalois::masked_f1_score(gBegin, gEnd, gCount, gMasks, num_classes, + localGroundTruth, preds); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu new file mode 100644 index 0000000000..ee70e1d578 --- /dev/null +++ b/libdeepgalois/src/Net.cu @@ -0,0 +1,227 @@ +#include "deepgalois/Net.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" +#include + +// the arguments of the maxima +__device__ int argmax_device(const int n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (int i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(int num_classes, int begin, int end, + mask_t* masks, float_t* preds, + label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + assert(count > 0); + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_accuracy kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; +} + +typedef 
float f1count_t; +__global__ void +masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, + float_t* preds, label_t* labels, + f1count_t* true_positive, f1count_t* false_positive, + f1count_t* false_negtive, f1count_t* true_negtive) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { + for (size_t j = 0; j < num_classes; j++) { + int idx = id * num_classes + j; + if (labels[idx] == 1 && preds[idx] > 0.5) { + atomicAdd(&true_positive[j], 1.0); + } else if (labels[idx] == 0 && preds[idx] > 0.5) { + atomicAdd(&false_positive[j], 1.0); + } else if (labels[idx] == 1 && preds[idx] <= 0.5) { + atomicAdd(&false_negtive[j], 1.0); + } else { + atomicAdd(&true_negtive[j], 1.0); + } + } + } + } +} + +acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + float beta = 1.0; + assert(count > 0); + f1count_t* h_tp = new f1count_t[num_classes]; + f1count_t* h_fp = new f1count_t[num_classes]; + f1count_t* h_fn = new f1count_t[num_classes]; + f1count_t* h_tn = new f1count_t[num_classes]; + f1count_t *d_tp, *d_fp, *d_fn, *d_tn; + float_malloc_device(num_classes, d_tp); + float_malloc_device(num_classes, d_fp); + float_malloc_device(num_classes, d_fn); + float_malloc_device(num_classes, d_tn); + init_const_gpu(num_classes, 0.0, d_tp); + init_const_gpu(num_classes, 0.0, d_fp); + init_const_gpu(num_classes, 0.0, d_fn); + init_const_gpu(num_classes, 0.0, d_tn); + masked_f1_score_kernel<<>>( + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); + CudaTest("solving masked_f1_score_kernel kernel failed"); + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; + acc_t precisionMacro = 0.0; + acc_t recallMacro = 0.0; + for (size_t i = 0; i < num_classes; i++) { + acc_t fn = (acc_t)h_fn[i]; // false negtive + acc_t fp = (acc_t)h_fp[i]; // false positive + acc_t tp = (acc_t)h_tp[i]; // true positive + // acc_t tn = (acc_t)h_tn[i]; // true positive + + precisionMacro = precisionMacro + (tp / (tp + fp)); + recallMacro = recallMacro + (tp / (tp + fn)); + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); + } + precisionMacro = precisionMacro / num_classes; + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + ((beta * beta) * precisionMacro + recallMacro); + acc_t recallMicro = rNumerator / rDenominator; + acc_t precisionMicro = pNumerator / pDenominator; + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro + << ", f1_macro: " << f1_macro << ") "; + + float_free_device(d_tp); + float_free_device(d_fp); + float_free_device(d_fn); + float_free_device(d_tn); + delete[] h_tp; + delete[] h_fp; + delete[] h_fn; + delete[] h_tn; + return f1_micro; +} + +namespace deepgalois { + +void Net::allocateSubgraphsMasks(int num_subgraphs) { + 
subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + //CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); +} + +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + + this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); + + // read the graph into CPU memory and copy it to GPU memory + this->distNumSamples = this->distContext->read_graph(dataset_str, is_selfloop); + + // read labels into CPU memory + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // read features into CPU memory + feature_dims[0] = this->distContext->read_features(dataset_str); + + // copy labels and features from CPU memory to GPU memory + distContext->copy_data_to_device(); // copy labels and input features to the device + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E +} + +void Net::read_test_masks(std::string dataset) { + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) + globalTestMasks[i] = 1; + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); + } + //copy_test_masks_to_device(); + copy_masks_device(globalSamples, globalTestMasks, d_test_masks); +} + +//void Net::copy_test_masks_to_device() {} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +} + +//void Net::normalize() {} + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); +} + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp new file mode 100644 index 0000000000..23efe124d2 --- /dev/null +++ b/libdeepgalois/src/RandomWalk.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + + // allocate the object for the new masked graph + Sampler::globalMaskedGraph = new GraphCPU(); + + std::vector degrees(g->size(), 0); + galois::gPrint("graph size: ", g->size(), "\n"); + // get degrees of nodes that will be in new graph 
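  // The masked graph is built with the usual CSR construction pattern:
  //   (1) for every masked vertex src, count only the edges whose
  //       destination is also masked (the do_all below fills `degrees`);
  //   (2) a parallel prefix sum over `degrees` gives the CSR row offsets,
  //       so offsets[src + 1] is the value passed to fixEndEdge();
  //   (3) a second pass writes the surviving edges with constructEdge(),
  //       starting at offsets[src] for each vertex.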
+ // this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all( + galois::iterate(size_t(0), g->size()), + [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + }, + galois::loopname("update_degrees")); + + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[g->size()]; + + // save ids (of original graph) of training nodes to vector + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) + Sampler::trainingNodes.push_back(i); + } + + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); + // same as original graph, except keep only edges involved in masks + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } + } + }, + galois::loopname("gen_subgraph")); + + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + + // TODO masked part graph as well to save time later; right now constructing + // from full part graph +} + +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) + m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDstHost( + Sampler::globalMaskedGraph->edge_begin_host(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp new file mode 100644 index 0000000000..055b5c0a85 --- /dev/null +++ b/libdeepgalois/src/Sampler.cpp @@ -0,0 +1,360 @@ +#include +#include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" +#define PARALLEL_GEN + +namespace deepgalois { + +//! 
debug function: prints out sets of vertices +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter++; + if (counter > 16 && counter < n - 16) + continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +/* +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 
0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} +*/ + +// API function for user-defined selection strategy +// Select n vertices from vertices and put them in vertex_set. +// nv: number of vertices in the original graph; +// n: number of vertices in the subgraph; +// m: number of vertices in the frontier. +// our implementation of GraphSAINT sampling +void Sampler::selectVertices(index_t nv, index_t n, Graph* g, + VertexList vertices, VertexSet& vertex_set) { + // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, + // graph size: ", g->size(), "\n"); + assert(nv == vertices.size()); + // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items((int)m, 0, (int)nv); + VertexList frontier(m); + for (index_t i = 0; i < m; i++) + frontier[i] = vertices[frontier_indices[i]]; + vertex_set.insert(frontier.begin(), frontier.end()); + // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + int* degrees = new int[m]; + // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + for (index_t i = 0; i < m; i++) { + degrees[i] = (int)getDegree(g, frontier[i]); + } //, galois::loopname("compute_degrees")); + for (index_t i = 0; i < n - m; i++) { + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; + auto degree = degrees[pos]; + int j = 0; + for (; j < degree; j++) { + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + if (vertex_set.find(dst) == vertex_set.end()) { + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.insert(dst); + break; + } + } + if (j == degree) + galois::gPrint("Not found from ", degree, " neighbors\n"); + } + /* + assert(n == vertex_set.size()); // size of vertex_set could be slightly + smaller than n galois::gPrint("Done selection, vertex_set size: ", + vertex_set.size(), ", set: "); print_vertex_set(vertex_set); + */ +} + +// Given a subset of vertices and a graph g, generate a subgraph sg from the +// graph g +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { + // auto n = origGraph.size(); // old graph size + auto nv = 
keptVertices.size(); // new graph (subgraph) size + VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); + std::vector degrees(nv, 0); // degrees of vertices in the subgraph + for (auto v : keptVertices) { + degrees[new_ids[v]] = getDegree(&origGraph, v); + } + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + auto ne = offsets[nv]; + // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, + // "\n"); + reindexGraph.allocateFrom(nv, ne); + reindexGraph.constructNodes(); + VertexList old_ids(keptVertices.begin(), + keptVertices.end()); // vertex ID mapping +#ifdef PARALLEL_GEN + galois::do_all( + galois::iterate(size_t(0), size_t(nv)), + [&](const auto i) { +#else + for (size_t i = 0; i < nv; i++) { +#endif + reindexGraph.fixEndEdge(i, offsets[i + 1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e = origGraph.edge_begin(old_id); + e != origGraph.edge_end(old_id); e++) { + auto dst = new_ids[origGraph.getEdgeDst(e)]; + assert(dst < nv); + reindexGraph.constructEdge(offsets[i] + j, dst, 0); + j++; + } + } +#ifdef PARALLEL_GEN + , + galois::loopname("construct_graph")); +#endif +} + +VertexSet Sampler::convertToLID(VertexSet& gidSet) { + VertexSet existingLIDs; + // find local selected vertices, convert to lid + for (auto i : gidSet) { + if (partGraph->isLocal(i)) { + existingLIDs.insert(partGraph->getLID(i)); + } + } + return existingLIDs; +} + +template +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + // template <> + // void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, + // std::vector& degrees) { + assert(degrees.size() == n); + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + // for (size_t src = 0; src < n; src++) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + }, + galois::loopname("update_degrees")); +} + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, + SubgraphTy* sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, + // "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub->allocateFrom(n, ne); + sub->constructNodes(); + + galois::do_all( + galois::iterate(size_t(0), size_t(n)), + [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } + } + }, + galois::loopname("gen_subgraph")); +} + +void Sampler::generateSubgraph(VertexSet& sampledSet, mask_t* masks, + Graph* sg) { + // n = 9000 by default + // do the sampling of vertices from training set + using masked graph + + // sampledSet is a list of *global* ids in the graph + // create new vertex set with LIDs for partitioned graph + VertexSet sampledLIDs = this->convertToLID(sampledSet); + + // VertexSet sampledLIDs; + // galois::gPrint("part graph num edges is ", 
partGraph->sizeEdges(), "\n"); + // galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), + // "\n"); for (auto i : this->trainingNodes) { + // sampledLIDs.insert(i); + //} + + // create the masks + createMasks(Sampler::partGraph->size(), sampledLIDs, masks); + + // this graph will contain sampled vertices and induced subgraph for it + Graph maskedSG; + // TODO use partMaskedGraph once constructed later + // remove edges whose destination is not masked + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + &maskedSG); + this->reindexSubgraph(sampledLIDs, maskedSG, *sg); + + // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu new file mode 100644 index 0000000000..1cdfc49e32 --- /dev/null +++ b/libdeepgalois/src/Sampler.cu @@ -0,0 +1,168 @@ +#include +#include +#include "deepgalois/cutils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +__global__ void clear_masks(index_t n, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[i] = 0; } +} + +// set the masks of vertices in a given vertex set +// n is the size of the vertex set +__global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[vertices[i]] = 1; } +} + +// compute the degrees of a masked graph +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, + index_t* degrees) { + CUDA_KERNEL_LOOP(src, n) { + //if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + degrees[src] = 0; + if (masks[src] == 1) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + //if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); + } +} + +// Given a graph, remove any edge which has end-point masked, and generate the +// subgraph n is the size of the original graph and the subgraph offset was +// computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t* masks, + const index_t* offsets, GraphGPU g, + GraphGPU subg) { + CUDA_KERNEL_LOOP(src, n) { + subg.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) + subg.constructEdge(idx++, dst); + } + } + } +} + +// compute the degrees of the subgraph induced by the vertex set +// n is the size of the vertex set +// new_ids array maps vertex ID in the original graph to the vertex ID in the +// subgraph +__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, + GraphGPU g, index_t* degrees) { + CUDA_KERNEL_LOOP(i, n) { + auto v = vertices[i]; + degrees[new_ids[v]] = g.getOutDegree(v); + } +} + +// Given a masked graph, remove the masked vertices, reindex the rest vertices, +// and generate the subgraph offset was computed by using prefix-sum of the new +// degrees n is the size of the old_ids and the sbugraph +__global__ void generate_graph_kernel(index_t n, const index_t* offsets, + const index_t* old_ids, + const index_t* new_ids, GraphGPU g, + GraphGPU subg) { + CUDA_KERNEL_LOOP(i, n) { + subg.fixEndEdge(i, offsets[i + 1]); + index_t j = 0; + auto src = old_ids[i]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < n); + 
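      // `i` is the vertex id in the re-indexed subgraph, src = old_ids[i] is
      // its id in the masked graph, and each destination is remapped through
      // new_ids[] so every id written into `subg` lies in [0, n). `offsets`
      // holds the exclusive prefix sum of the re-indexed degrees computed in
      // generateSubgraph() below, so offsets[i] + j is the CSR slot of the
      // j-th edge kept for vertex i.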
subg.constructEdge(offsets[i] + j, dst); + j++; + } + } +} + +/* +void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { + index_t vid = 0; + for (index_t i = 0; i < n; i++) { + auto v = vertices[i]; + new_indices[v] = vid++; + } +} +*/ + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { + //std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; + index_t *degrees, *offsets; + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); + get_masked_degrees<<>>(n, masks, *g, degrees); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); + thrust::exclusive_scan(thrust::device, degrees, degrees+n+1, offsets); + CUDA_CHECK(cudaFree(degrees)); + index_t ne; + CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); + //std::cout << "maskedSG num_edges " << ne << "\n"; + subg->allocateFrom(n, ne); // TODO: avoid reallocation + generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); + CUDA_CHECK(cudaFree(offsets)); +} + +// n: size of the original graph +// nv: size of the subgraph; i.e. size of vertex_set +// masks, graph g and subgraph sub are on the device (GPU) +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { + index_t n = partGraph->size(); + auto nv = vertex_set.size(); + //std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; + // convert the vertex_set to a vertex_list and copy it to the device + VertexList vertex_list(vertex_set.begin(), vertex_set.end()); + index_t* d_vertex_list; + cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); + + createMasks(n, vertex_set, masks); + mask_t* d_masks; + cudaMalloc((void**)&d_masks, n * sizeof(mask_t)); + CUDA_CHECK(cudaMemcpy(d_masks, masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + //clear_masks<<>>(n, d_masks); // set all 0 + //CudaTest("solving clear_masks kernel failed"); + // createMasks: set masks for vertices in the vertex_set + //set_masks<<>>(n, d_vertex_list, d_masks); + //CudaTest("solving set_masks kernel failed"); + GraphGPU masked_sg; // size is the same as original graph, but masked dst removed + getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked + //std::cout << "maskedGraph generated\n"; + + // re-index the subgraph + index_t* d_new_ids; + cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); + // Given an old vertex ID ∈ [0, n), returns a new vertex ID ∈ [0, nv) + auto new_ids = reindexVertices(n, vertex_set); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); + + // generate the offsets for the re-indexed subgraph + index_t *degrees, *offsets; + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv)); + get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); + CudaTest("solving get_new_degrees kernel failed"); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); + thrust::exclusive_scan(thrust::device, degrees, degrees+nv+1, offsets); + CUDA_CHECK(cudaFree(degrees)); + index_t ne; + CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); + //std::cout << "subgraph num_edges " << ne << "\n"; + + // allocate memory for the subgraph + sub->allocateFrom(nv, ne); // avoid reallocation + // generate the subgraph + generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, 
masked_sg, *sub); + CudaTest("solving generate_graph kernel failed"); + CUDA_CHECK(cudaFree(offsets)); + //std::cout << "Subgraph generated\n"; +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp new file mode 100644 index 0000000000..4275232baa --- /dev/null +++ b/libdeepgalois/src/Train.cpp @@ -0,0 +1,554 @@ +#include "galois/Galois.h" +#include "deepgalois/Net.h" + +namespace deepgalois { + +Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, + unsigned subg_sz, int val_itv) + : // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), + // globalValBegin(0), globalValEnd(0), globalValCount(0), + // globalTestBegin(0), globalTestEnd(0), globalTestCount(0), + // globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) + // {} + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + this->header = "[" + std::to_string(myID) + "] "; + assert(n_conv > 0); + this->num_layers = num_conv_layers + 1; + + // additional layers to add + if (has_l2norm) + this->num_layers++; + if (has_dense) + this->num_layers++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); + print_configs(); + + // initialze global graph context + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); + // read *entire* graph, get num nodes + globalSamples = graphTopologyContext->read_graph(selfloop); + + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); + + // reddit is hard coded + if (dataset_str == "reddit") { + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + + // TODO do all can be used below + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) + globalValMasks[i] = 1; + } else { + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); + } + // make sure sampel size isn't greater than what we have to train with + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; + + // features are 
read in distcontext, not this context (this context only + // used for sampling) + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); +} + +void Net::train(optimizer* opt, bool need_validate) { + galois::StatTimer train_timer("Timer_0"); + train_timer.start(); + std::string separator = "\n"; + double total_train_time = 0.0; + int num_subg_remain = 0; +#ifndef GALOIS_ENABLE_GPU + unsigned hostID = galois::runtime::getSystemNetworkInterface().ID; +#endif + + if (subgraph_sample_size) { + galois::StatTimer construct_time("SubgraphAllocateTime"); + construct_time.start(); + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); + construct_time.stop(); + } + + galois::gPrint(header, "Start training...\n"); + + Timer t_epoch; + + // run epochs + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { + t_epoch.Start(); + + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// + if (subgraph_sample_size) { + galois::StatTimer sample_time("SubgraphSampleTime"); + sample_time.start(); + if (num_subg_remain == 0) { + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; + galois::StatTimer t_subgen("SubgraphGenerateTime"); + t_subgen.start(); + + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); + } + num_subg_remain = num_subgraphs; + t_subgen.stop(); + } + // count their degrees + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = distContext->getSubgraphPointer(i); + sg_ptr->degree_counting(); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); + } + + // choose a subgraph to use + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + // std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); + } + + // update labels for subgraph + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); + + // update features for subgraph + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + 
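+      // At this point the sampled subgraph is fully wired in for this epoch:
+      // the conv layers point at its topology and normalization constants,
+      // the output layer at its labels, and the input layer at its features,
+      // so the following forward/backward passes train on the subgraph only.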
// Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} + sample_time.stop(); + } // end subgraph sample loop + //////////////////////////////////////////////////////////////////////////////// + + // training steps +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + if (hostID == 0) { + galois::gInfo("Epoch ", std::setw(3), curEpoch); + } +#endif + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + + // galois::gPrint(header, "Calling into eval for forward propagation\n"); + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + // evaluate("train", train_loss, train_acc); + + // galois::gPrint(header, "Calling into backward propagation\n"); + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight + // gradients + Net::bprop(); + + // galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch + Net::update_weights(opt); // update parameters + + t_epoch.Stop(); + + // validation / testing + set_netphases(net_phase::test); + +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + if (hostID == 0) { + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, separator); + } +#endif + + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + + // report current total time + accuracy as a stat +#ifndef GALOIS_ENABLE_GPU + if (hostID == 0) { + galois::runtime::reportParam( + std::string("GNN"), + "Epoch" + std::to_string(curEpoch) + "TestAccuracyAndTime", + std::to_string(train_acc) + ";" + std::to_string(total_train_time)); + } +#endif + + if (need_validate && (curEpoch % val_interval == 0)) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + if (hostID == 0) { + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, separator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); + } +#endif + } else { +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; +#else + if (hostID == 0) { + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); + } +#endif + } + } // epoch loop + + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; +#ifdef GALOIS_ENABLE_GPU + 
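+  // total_train_time accumulates epoch_time in milliseconds, so the
+  // throughput above is 1000 * num_epochs / total_train_time epochs per second.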
std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif + train_timer.stop(); +} + +// evaluate, i.e. inference or predict +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO get rid of this timer + Timer t_eval; + t_eval.Start(); + + galois::StatTimer eval_timer("EvaluateTime"); + eval_timer.start(); + + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; + + // TODO global here good for dist case? + if (type == "train") { + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; + if (subgraph_sample_size) { + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; + } + } else if (type == "val") { + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; + } else { + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; + } + + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(distNumSamples); + for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef GALOIS_ENABLE_GPU + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + } + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data + } +#ifdef GALOIS_ENABLE_GPU + if (type == "train") { + gMasks = d_train_masks; + } else if (type == "val") { + gMasks = d_val_masks; + } else { + gMasks = d_test_masks; + } +#endif + + // galois::gPrint(header, "Doing actual forward propagation\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + // galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable + label_t* localLabels; + if (type == "train" && subgraph_sample_size) { + localLabels = distContext->get_labels_subg_ptr(); + } else { + // note this grabs local labels + localLabels = distContext->get_labels_ptr(); + } + + if (is_single_class) { + acc = + masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, localLabels); + } else { + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); + } + + eval_timer.stop(); + + // TODO replace with stat timer + t_eval.Stop(); + return t_eval.Millisecs(); +} + +void Net::construct_layers() { + // append conv layers + // galois::gPrint(header, "Constructing layers...\n"); + for (size_t i = 0; i < num_conv_layers - 1; i++) { + append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false + + if (has_l2norm) { + append_l2norm_layer(num_conv_layers); // l2_norm layer + } + if (has_dense) { + append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + 
for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + for (size_t i = 0; i < num_layers; i++) { + layers[i]->malloc_and_init(); + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure + // context->norm_factor_computing(false); + distContext->constructNormFactor(graphTopologyContext); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + set_contexts(); +} + +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); +} + +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); +} +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); +#ifdef GALOIS_ENABLE_GPU + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif +} + +//! forward propagation: [begin, end) is the range of samples used. +//! 
calls "forward" on each layer and returns the loss of the final layer +acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + galois::StatTimer fprop_timer("ForwardPropTime"); + fprop_timer.start(); + // set mask for the last layer; globals + // TODO this should be distirbuted sample gBegin->end not global; fix later + // seems to be unused in code right now anyways + // galois::gPrint(header, "fprop: set sample mask\n"); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); + + for (size_t i = 0; i < num_layers; i++) { + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + layers[i]->forward(); + } + + // galois::gPrint(header, "fprop: getting loss\n"); + // prediction error + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + fprop_timer.stop(); + return loss; +} + +// back propagation +void Net::bprop() { + galois::StatTimer bprop_timer("BackPropTime"); + bprop_timer.start(); + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } + bprop_timer.stop(); +} + +// update trainable weights after back-propagation +void Net::update_weights(optimizer* opt) { + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + +//! Save the context object to all layers of the network +void Net::set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(distContext); +} + +//! set netphases for all layers in this network +void Net::set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); +} + +//! print all layers +void Net::print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); +} + +// print the configurations +void Net::print_configs() { + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp new file mode 100644 index 0000000000..ce9d709dbf --- /dev/null +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -0,0 +1,54 @@ +#include "deepgalois/layers/aggregator.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +// TODO template arg +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, float_t* norm_factor) { + // std::cout << "[update_all] graph size: " << n << "\n"; + size_t n = g.size(); + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + auto src_idx = src * len; + // zero out the output data + math::clear_cpu(len, &out[src_idx]); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + // float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); + // scale the neighbor's data using the normalization factor + math::scale(len, b, &in[dst_idx], 
&neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } else { + // add embeddings from neighbors together; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); + } + } + }, + galois::steal(), galois::loopname("update_all")); +} + +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool, float_t* norm_factor) { + galois::StatTimer Tcsrmm("CSRMM-MKL"); + Tcsrmm.start(); + unsigned n = g.size(); + math::clear_cpu(n * len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, + out); + Tcsrmm.stop(); +} diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu new file mode 100644 index 0000000000..b29e980da3 --- /dev/null +++ b/libdeepgalois/src/layers/aggregator.cu @@ -0,0 +1,102 @@ +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include "deepgalois/cutils.h" +#include "deepgalois/layers/aggregator.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// TODO: use warp +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, + const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) + y[i] = alpha * a[i] + b[i]; +} + +__global__ void update_all_naive(size_t n, size_t len, GraphGPU g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + if (norm) + b = a * norm_factor[dst]; + scale_add(len, b, in + dst * len, out + src * len, + out + src * len); // out[src] += in[dst] + } + } +} + +__global__ void update_all_warp(size_t n, size_t len, GraphGPU g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE / WARP_SIZE][2]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int src = warp_id; src < n; src += num_warps) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + if (thread_lane < 2) + ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); + __syncthreads(); + const index_type row_begin = ptrs[warp_lane][0]; + const index_type row_end = ptrs[warp_lane][1]; + index_type base_src = src * len; + for (index_type offset = row_begin; offset < row_end; offset++) { + index_type dst = g.getEdgeDst(offset); + if (norm) + b = a * norm_factor[dst]; + index_type base_dst = dst * len; + for (int i = 0; i < len; i += WARP_SIZE) + if (thread_lane + i < len) + out[base_src + thread_lane + i] += in[base_dst + thread_lane + i] * b; + } + } +} + +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.size(); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + // update_all_naive<<>>(n, len, g, in, + // out, norm, norm_factor); + update_all_warp<<<(n - 1) / WARPS_PER_BLOCK + 
1, BLOCK_SIZE>>>( + n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); +} + +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + // g.print_test(); + unsigned n = g.size(); + auto nnz = g.sizeEdges(); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + // std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " + // nnz " << nnz << "\n"; print_device_vector(10, norm_factor, "norm_factor"); + float* temp; + const int* row_start = (const int*)g.row_start_ptr(); + const int* edge_dst = (const int*)g.edge_dst_ptr(); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); + // print_device_int_vector(10, row_start, "row_start"); + // print_device_int_vector(10, edge_dst, "edge_dst"); + float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + float_free_device(temp); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h new file mode 100644 index 0000000000..d57f485a8c --- /dev/null +++ b/libdeepgalois/src/layers/gat_fw.h @@ -0,0 +1,158 @@ +// #define USE_GAT +#ifdef USE_GAT +// `Graph Attention Network ` +// NOTE: GAT paper uses "first concatenation then linear projection" +// to compute attention scores, while ours is "first projection then +// addition", the two approaches are mathematically equivalent: +// We decompose the weight vector a mentioned in the paper into +// [a_l || a_r], then a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j +// Our implementation is much efficient because we do not need to +// save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, +// addition could be optimized with DGL's built-in function u_add_v, +// which further speeds up computation and saves memory footprint. 
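+// A minimal sketch of the decomposed per-edge score, using math::dot and
+// math::leaky_relu only as they appear in aggregate() below; the helper name
+// gat_edge_score is purely illustrative and not part of this layer:
+//
+//   static float_t gat_edge_score(size_t len, const float_t* a_l,
+//                                 const float_t* a_r, const float_t* Wh_i,
+//                                 const float_t* Wh_j, float_t eps) {
+//     // a^T [Wh_i || Wh_j] == a_l . Wh_i + a_r . Wh_j
+//     float_t s = math::dot(len, a_l, Wh_i) + math::dot(len, a_r, Wh_j);
+//     float_t score;
+//     math::leaky_relu(eps, s, score);
+//     return score;
+//   }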
+ +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + size_t n = g.size(); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // concatenation, dot product, LeakyReLU + // int i = 0; + // vec_t scores(deg_src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + // alpha: learnable weight vector (shared by all vertices) + float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + // vec_t concat_vec(2*len); + // math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); + temp_scores[e] = src_score + dst_score; + math::leaky_relu(epsilon, temp_scores[e], scores[e]); + } + + // softmax to normalize the attention scores on each vertex’s incoming edges + // vec_t normalized_scores(deg_src, 0); + // math::softmax(deg_src, &scores[0], &normalized_scores[0]); + math::softmax(deg_src, &scores[begin], &norm_scores[begin]); + + // aggregation: scaled by the attention scores + math::clear_cpu(len, &out[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = norm_scores[e]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); +} + +void graph_conv_layer::d_compute_scores(size_t len, Graph& g, + const float_t* in_data, + const float_t* out_data, + const float_t* in_grad) { + size_t n = g.size(); + + // compute gradients for the learnable vector `alpha` + // vec_t temp_grad(n*n); + // math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // in_grad, 0.0, temp_grad); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + auto deg_src = g.get_degree(src); + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + &scores_grad[begin], &norm_scores_grad[begin]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + // use norm_scores_grad as temp_scores_grad since its data is useless + // already + math::d_leaky_relu(epsilon, &scores_grad[e], &temp_scores[e], + &norm_scores_grad[e]); + math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); + math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); + } + }); +} + +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in_grad, + float_t* out_grad) { + size_t n = g.size(); + + // aggregation: the derivative is transposed; + // the graph is undirected (structurally symmetric), + // but values are not the same for the symmetric positions + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto src_begin = g.edge_begin(src); + for (auto e = src_begin; e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto dst_begin = g.edge_begin(dst); + auto score = norm_scores[dst_begin + e - src_begin]; // transposed + vec_t neighbor(len); + math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); + } + }); +} + +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* 
out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // dropout + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + + // linear transformation + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + + // aggregation + aggregate(z, *graph_cpu, out_temp, out_data); + + // ReLU + if (act_) + math::relu_cpu(x * z, out_data, out_data); +} + +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + + // compute gradients for alpha (alpha is a learnable vector) + d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); + // compute gradients for feature vectors + d_aggregate(z, *graph_cpu, out_grad, out_temp); + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); +} + +#endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp new file mode 100644 index 0000000000..f13b26be25 --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -0,0 +1,295 @@ +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/utils.h" + +static galois::DynamicBitSet bitset_conv; + +#include "deepgalois/layers/GraphConvSyncStructures.h" +#include "deepgalois/layers/GradientSyncStructs.h" + +namespace deepgalois { +#include "gat_fw.h" + +//! Set this to let sync struct know where to get data from +float_t* _dataToSync = nullptr; +//! Set this to let sync struct know the size of the vector to use during +//! 
sync +long unsigned _syncVectorSize = 0; + +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix, unsigned seed) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng(seed); + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } +} + +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } +} + +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + galois::gInfo("conv bitset size is going to be ", x); + bitset_conv.resize(x); + + // setup gluon + layer::gradientGraph = + new deepgalois::GluonGradients(layer::weight_grad, y * z); + layer::syncSub = + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); + galois::gInfo("gradient bitset size is going to be ", y * z, " ", y, " ", z); + + // make sure seed consistent across all hosts for weight matrix + rand_init_matrix(y, z, W, 1); + // rand_init_matrix(y, z, Q, 1); // for GraphSAGE + + zero_init_matrix(y, z, layer::weight_grad); + +#ifdef USE_GAT + // alpha is only used for GAT + rand_init_matrix(z, 1, alpha_l, 1); + rand_init_matrix(z, 1, alpha_r, 1); + alpha_lgrad.resize(2 * z); + alpha_rgrad.resize(2 * z); + std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); + std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); + auto ne = graph_cpu->sizeEdges(); // number of edges + scores.resize(ne); // a score for each edge + temp_scores.resize(ne); + scores_grad.resize(ne); + norm_scores.resize(ne); + norm_scores_grad.resize(ne); + epsilon = 0.2; // LeakyReLU angle of negative slope +#endif + dropout_ = true; + act_ = false; + + if (dropout_) + dropout_mask = new mask_t[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t[x * z]; + trans_data = new float_t[y * x]; // y*x + if (y <= z) + in_temp1 = new float_t[x * y]; +} + +namespace { +void set_conv_bitset() { + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_conv.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_conv.set(node_id); + } + }, + galois::loopname("BitsetGraphConv"), galois::no_stats()); +} + +} // end anonymous namespace + +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors +} + +#ifndef USE_GAT +// aggregate based on graph topology +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); + // normalization constant 
based on graph structure +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + update_all(len, g, in, out, norm_, norm_consts); +#endif + aggregate_timer.stop(); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif + aggregate_timer.stop(); +} + +// 𝒉[𝑙] = σ(𝑊 * Σ(𝒉[𝑙-1])) +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + galois::gPrint("forward ", x, " ", y, " ", z, "\n"); + + galois::StatTimer drop_timer("GraphConvForwardDropout"); + drop_timer.start(); + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + drop_timer.stop(); + + galois::StatTimer compute_timer("GraphConvForwardCompute"); + compute_timer.start(); + if (y > z) { + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + aggregate(z, *graph_cpu, out_temp, out_data); + } else { + aggregate(y, *graph_cpu, in_temp, in_temp1); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, + &layer::W[0], 0.0, out_data); + } + compute_timer.stop(); + + // TODO sync of out_data required here + // TODO how to do this for the sampled case? + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + set_conv_bitset(); + + galois::gPrint("forward ", x, " ", y, " ", z, " sync calling\n"); + layer::context->getSyncSubstrate() + ->sync("GraphConvForward"); + + // run relu activation on output if specified + galois::StatTimer relu_timer("GraphConvForwardRelu"); + relu_timer.start(); + if (act_) + math::relu_cpu(x * z, out_data, out_data); + relu_timer.stop(); + + conv_timer.stop(); +} + +// 𝜕𝐸 / 𝜕𝑦[𝑙−1] = 𝜕𝐸 / 𝜕𝑦[𝑙] ∗ 𝑊 ^𝑇 +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + galois::StatTimer conv_timer("GraphConvBackward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + // note; assumption here is that out_grad contains 1s or 0s via relu? 
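+  // Backward mirrors the forward ordering: when y > z the forward pass did
+  // (multiply by W, then aggregate), so here the transposed derivative is
+  // aggregated first and W^T applied afterwards; otherwise W^T is applied
+  // first and the result aggregated, reusing in_temp as the intermediate.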
+ galois::StatTimer relu_timer("GraphConvBackwardRelu"); + relu_timer.start(); + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + relu_timer.stop(); + // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + + galois::StatTimer compute_timer("GraphConvBackwardCompute"); + compute_timer.start(); + if (y > z) { + d_aggregate(z, *graph_cpu, out_grad, out_temp); + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights + // this calculates gradients for the node predictions + if (level_ != 0) { // no need to calculate in_grad for the first layer + // derivative of matmul needs transposed matrix + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + } + // calculate weight gradients using input data; multiplied by gradients from + // last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } else { + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], + 0.0, in_temp); + d_aggregate(y, *graph_cpu, in_temp, in_grad); + } + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, + 0.0, &layer::weight_grad[0]); + } + compute_timer.stop(); + + // sync agg + // galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + if (level_ != 0) { + deepgalois::_syncVectorSize = y; + deepgalois::_dataToSync = in_grad; + set_conv_bitset(); + layer::context->getSyncSubstrate() + ->sync( + //->sync( + "GraphConvBackward"); + } + galois::StatTimer drop_timer("GraphConvBackwardDropout"); + drop_timer.start(); + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); + drop_timer.stop(); + + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = &layer::weight_grad[0]; + unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; + layer::syncSub->sync("Gradients"); + galois::do_all( + galois::iterate((size_t)0, (size_t)z), + [&](size_t i) { + // galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + // galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process")); + + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); + conv_timer.stop(); +} +#endif + +acc_t graph_conv_layer::get_weight_decay_loss() { + return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu new file mode 100644 index 0000000000..f8b59d3c0e --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -0,0 +1,117 @@ +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + if (dropout_) + CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); + float_malloc_device(x * y, in_temp); + init_const_gpu(x * y, 0.0, in_temp); + if (y <= z) { + float_malloc_device(x * y, in_temp1); + init_const_gpu(x * y, 0.0, in_temp1); + } + float_malloc_device(x * z, out_temp); + init_const_gpu(x * z, 0.0, out_temp); + float_malloc_device(y * z, d_W); + auto init_range = sqrt(6.0 / (y + z)); + // 
Glorot & Bengio (AISTATS 2010) + rng_uniform_gpu(y * z, -init_range, init_range, d_W); + float_malloc_device(y * z, layer::d_weight_grad); + init_const_gpu(y * z, 0.0, layer::d_weight_grad); +} + +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_consts); +#endif +} + +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_consts); +#endif +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out) {} + +// GPU forward: compute output features +// NOTE: in_data will be used in back-prop, so it can not be modified +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // currently only support feature length <= 128 + if (z > MAX_NUM_CLASSES) { + std::cout << "Currently support maximum hidden feature length of " + << MAX_NUM_CLASSES << "\n"; + exit(0); + } + init_const_gpu(x * z, 0.0, out_temp); + if (dropout_ && phase_ == net_phase::train) + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + else + copy_gpu(x * y, in_data, in_temp); + if (y > z) { + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, + out_temp); + graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); + } else { + graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, + out_data); + } + if (act_) + relu_gpu(x * z, out_data, out_data); +} + +// GPU backward: compute input gradients (in_grad) and weight gradients +// (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_grad); + if (y > z) { + graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); + if (level_ != 0) + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + layer::d_weight_grad); + } else { + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, + in_temp); + graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, + layer::d_weight_grad); + } + if (level_ != 0 && dropout_) + d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); +} + +acc_t graph_conv_layer::get_weight_decay_loss() { + return l2_norm_gpu(input_dims[1] * output_dims[1], d_W); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp new file mode 100644 index 0000000000..8de2406ede --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -0,0 +1,53 @@ +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +void 
l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, + galois::loopname("l2_norm")); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, + galois::loopname("d_l2_norm")); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu new file mode 100644 index 0000000000..ed86cf147d --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -0,0 +1,21 @@ +#include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + l2_norm_gpu(x, y, in_data, out_data); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp new file mode 100644 index 0000000000..a230de1090 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -0,0 +1,28 @@ +#include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); +} + +// 𝑦[𝑙] = 𝑦[𝑙−1] > 0 ? 
𝑦[𝑙−1]) : 𝑦[𝑙−1] * ε +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + math::leaky_relu_cpu(n, epsilon_, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) +void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu new file mode 100644 index 0000000000..a6271086e9 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -0,0 +1,20 @@ +#include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = 𝑦[𝑙−1] > 0 ? 𝑦[𝑙−1]) : 𝑦[𝑙−1] * ε +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + leaky_relu_gpu(n, epsilon_, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp new file mode 100644 index 0000000000..0576bea642 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -0,0 +1,21 @@ +#include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = max(0, 𝑦[𝑙−1]) +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t n = input_dims[0] * input_dims[1]; + math::relu_cpu(n, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 +void relu_layer::back_propagation(const float_t*, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t n = input_dims[0] * input_dims[1]; + math::d_relu_cpu(n, out_grad, out_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu new file mode 100644 index 0000000000..d457c994ce --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -0,0 +1,22 @@ +#include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +// 𝑦[𝑙] = max(0, 𝑦[𝑙−1]) +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + +// 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) +// = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp new file mode 100644 index 0000000000..8d72ed4b07 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -0,0 +1,122 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, 
out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { delete[] loss; } + +void sigmoid_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + +inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { + // return context->get_label(i, j); + return labels[i * input_dims[1] + j]; +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + size_t featLen = input_dims[1]; + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + // check if local to this host + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + size_t idx = featLen * lid; + + // output is normalized input for this layer + math::sigmoid(featLen, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + + // one hot encoded vector for the labels + // TODO this is a bottleneck; big lock on memory allocator + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + ground_truth[j] = (float_t)get_label(lid, j); + // loss calculation + this->loss[lid] = + math::cross_entropy(featLen, ground_truth, &out_data[idx]); + + // TODO this is a bottleneck, lock on memory possibly + delete[] ground_truth; + } + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-fw")); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { + size_t featLen = layer::input_dims[1]; + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + + size_t idx = featLen * lid; + // TODO this is bad + float_t* norm_grad = new float_t[featLen]; + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + ground_truth[j] = (float_t)get_label(lid, j); + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, ground_truth, &out_data[idx], + norm_grad); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(featLen, &in_data[idx], &out_data[idx], + &in_grad[idx], norm_grad); + // TODO this is bad + delete[] norm_grad; + delete[] ground_truth; + } + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-bw")); +} + +acc_t sigmoid_loss_layer::get_prediction_loss() { + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu new file mode 100644 index 0000000000..0f5ff9cb69 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -0,0 +1,40 @@ +#include 
"deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } + +void sigmoid_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); +} + +acc_t sigmoid_loss_layer::get_prediction_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp new file mode 100644 index 0000000000..17e7023176 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -0,0 +1,131 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "galois/Galois.h" + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +softmax_loss_layer::~softmax_loss_layer() { delete[] loss; } + +void softmax_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + +inline label_t softmax_loss_layer::get_label(size_t i) { + return labels[i]; + // return context->get_label(i); +} + +// TODO: need kernel fusion optimization +// 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘] +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // size_t numSamples = input_dims; + size_t featLen = input_dims[1]; + // zero out the output vector + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + out_data[i * featLen + j] = 0.0; + } + } + + galois::do_all( + galois::iterate(begin_, end_), + [&](const unsigned gid) { + // if no mask used it means all are fair game + if (!use_mask || masks_[gid] == 1) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + // output is normalized input for this layer + math::softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + // labels are local + groundTruth[get_label(lid)] = 1.0; // one-hot + // loss calculation + loss[lid] = math::cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid]); + } + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-fw")); + + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { + // note: 
out_grad is ignored because it shouldn't exist (this is output layer) + size_t featLen = layer::input_dims[1]; + + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + in_grad[i * featLen + j] = 0.0; + } + } + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + vec_t norm_grad(featLen); + std::vector groundTruth(featLen, 0.0); + groundTruth[get_label(lid)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid], &in_grad[featLen * lid], + &norm_grad[0]); + } + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-bw")); + + // no weight sync required: this is all local graph information +} + +acc_t softmax_loss_layer::get_prediction_loss() { + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu new file mode 100644 index 0000000000..20b7e659d8 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -0,0 +1,40 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } + +void softmax_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); +} + +acc_t softmax_loss_layer::get_prediction_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp new file mode 100644 index 0000000000..31cd353e51 --- /dev/null +++ b/libdeepgalois/src/lgraph.cpp @@ -0,0 +1,41 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/utils.h" +#include "deepgalois/reader.h" +#include 
"galois/Galois.h" +#include + +namespace deepgalois { + +bool LearningGraph::isLocal(index_t) { return true; } + +index_t LearningGraph::getLID(index_t) { return 0; } + +bool LearningGraph::is_vertex_cut() { return true; } + +std::vector>& LearningGraph::getMirrorNodes() { + return mirrorNodes; +} + +uint64_t LearningGraph::numMasters() { return 0; } + +uint64_t LearningGraph::globalSize() { return 0; } + +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + if (selfloop) + std::cout << "selfloop not yet implemented\n"; + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + +void LearningGraph::degree_counting() { + // if (degrees_ != NULL) return; + // degrees_ = new index_t[num_vertices_]; + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); +} + +void LearningGraph::dealloc() {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu new file mode 100644 index 0000000000..9e1f2ab29e --- /dev/null +++ b/libdeepgalois/src/lgraph.cu @@ -0,0 +1,77 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/cutils.h" +#include "deepgalois/reader.h" +#include + +namespace deepgalois { + +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + +void LearningGraph::dealloc() { + assert(is_device); + CUDA_CHECK(cudaFree(d_colidx_)); + CUDA_CHECK(cudaFree(d_rowptr_)); + CUDA_CHECK(cudaFree(d_degrees_)); + if (edge_data_ != NULL) + CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) + CUDA_CHECK(cudaFree(d_vertex_data_)); +} + +void LearningGraph::allocOnDevice(bool no_edge_data__) { + if (d_colidx_ != NULL) + return; + CUDA_CHECK(cudaMalloc((void**)&d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK( + cudaMalloc((void**)&d_rowptr_, (num_vertices_ + 1) * sizeof(index_t))); + // CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * + // sizeof(index_t))); if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) + // &edge_data__, num_edges_ * sizeof(edge_data___t))); + // CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * + // sizeof(vdata_t))); + is_device = true; +} + +void LearningGraph::print_test() { + printf("d_rowptr_: 0x%x\n", d_rowptr_); + printf("d_colidx_: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); +} + +void LearningGraph::copy_to_gpu() { + allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), + num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyHostToDevice)); + print_test(); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyHostToDevice)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ + // * sizeof(vdata_t), cudaMemcpyHostToDevice)); +} + +void LearningGraph::copy_to_cpu() { + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, + num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, + (num_vertices_ + 1) * 
sizeof(index_t), + cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyDeviceToHost)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, + // num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); +} + +void LearningGraph::degree_counting() {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp new file mode 100644 index 0000000000..b8addfe887 --- /dev/null +++ b/libdeepgalois/src/math_functions.cpp @@ -0,0 +1,368 @@ +#include +#include +#include +#include +#include +#include "galois/Timer.h" +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/random.h" +#include "deepgalois/math_functions.hh" + +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header +extern "C" { +#include +} +#endif + +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet"; \ + exit(1); \ + } while (0); + +/* +#include +typedef boost::mt19937 rng_t; +inline rng_t* deepgalois_rng() { + return static_cast(Context::rng_stream().generator()); +} + +void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(deepgalois_rng(), random_distribution); + for (size_t i = 0; i < n; ++i) + r[i] = variate_generator(); +} +*/ + +// anon namespace so these things don't leak elsewhere +namespace { +static deepgalois::PerThreadRNG* per_thread_rng = nullptr; +} + +namespace deepgalois { + +namespace math { + +inline uint8_t bernoulli(float_t p) { + if (!per_thread_rng) { + per_thread_rng = new PerThreadRNG(); + } + return per_thread_rng->get_number() > p ? 1 : 0; +} + +//! wrapper function to call cblas_sgemm +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); + Tmatmul.stop(); +} + +#ifdef USE_MKL +void csrmm_cpu(const int M, const int N, const int K, const int, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nnz_idx, const float* B, const float beta, float* C) { +#else +void csrmm_cpu(const int, const int, const int, const int, const float, float*, + int*, int*, const float*, const float, float*) { +#endif +#ifdef USE_MKL + // mkl_set_num_threads(56); + // const char *matdescra = "GXXCX";//6 bytes + // const char transa = 'N'; + // mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, + // A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + sparse_status_t status; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_layout_t layout = + (is_row_major ? 
SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, + A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + sparse_operation_t transa = (need_trans ? SPARSE_OPERATION_TRANSPOSE + : SPARSE_OPERATION_NON_TRANSPOSE); + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + // descrA.mode = SPARSE_FILL_MODE_UPPER; + // descrA.diag = SPARSE_DIAG_NON_UNIT; + // mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + // mkl_sparse_optimize(csrA); + status = + mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + mkl_sparse_destroy(csrA); +#else + NOT_IMPLEMENTED; +#endif +} + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 + +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { +#ifdef USE_MKL + vsAdd(n, a, b, y); +#else +#ifdef __AVX2__ + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + y[i] = a[i] + b[i]; +#else + for (size_t i = 0; i < n; ++i) + y[i] = a[i] + b[i]; +#endif +#endif +} + +void scal(size_t n, const float_t alpha, float_t* x) { + cblas_sscal(n, alpha, x, 1); +} + +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} + +void axpy(size_t n, const float_t a, float_t* x, float_t* y) { + cblas_saxpy(n, a, x, 1, y, 1); +} + +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +// l2 normalization +float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } + +// dot product +float_t dot(size_t n, const float_t* x, const float_t* y) { + return cblas_sdot(n, x, 1, y, 1); +} + +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { + copy_cpu(n, x, z); + copy_cpu(n, y, z + n); +} + +void clear_cpu(size_t n, float_t* in) { + // for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in + n, 0); + // memset(in, 0, n*sizeof(float_t)); +} + +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + masks[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; +} + +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + size_t len = n * m; + + galois::do_all( + galois::iterate((size_t)0, len), + [&](size_t i) { masks[i] = bernoulli(dropout_rate); }, + galois::loopname("dropout RNG")); + + galois::do_all( + galois::iterate((size_t)0, len), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("dropout")); +} + 
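+// Sketch of the inverted-dropout convention these helpers follow. The caller
+// supplies `scale`; the usual choice (an assumption here, not enforced) is
+//
+//   float dropout_rate = 0.5f;
+//   float scale        = 1.0f / (1.0f - dropout_rate);   // 2.0f in this case
+//
+// With masks[i] drawn by bernoulli(dropout_rate) above -- which keeps a unit
+// with probability 1 - dropout_rate, assuming get_number() is uniform on
+// [0,1) -- the forward pass out[i] = in[i] * masks[i] * scale preserves the
+// expected activation, and d_dropout/d_dropout_cpu below reuse the same masks
+// and scale so dropped units receive zero gradient.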
+void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, + float_t* out) { + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; +} + +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* masks, float_t* out) { + galois::do_all( + galois::iterate((size_t)0, n * m), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("d_dropout")); +} + +void relu_cpu(size_t n, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); }, + galois::chunk_size<64>(), galois::loopname("relu")); +} + +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, + float_t* out) { + // TODO: vectorize + // check if original data greater than 0; if so keep grad + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, + galois::chunk_size<64>(), galois::loopname("d_relu")); +} + +void leaky_relu(float_t epsilon, float_t in, float_t& out) { + out = in > 0.0 ? in : epsilon * in; +} + +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out) { + out = in * (data > 0.0 ? 1.0 : epsilon); +} + +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; }, + galois::chunk_size<64>(), galois::loopname("leaky_relu")); +} + +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out) { + // TODO: vectorize + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); + }, + galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void d_softmax(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and +// amazon inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + +// 0.5; } +inline float_t sigmoid_func(float_t x) { return 1. / (1. + expf(-x)); } + +// Sigmoid +void sigmoid(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; i++) { + out[i] = 1. / (1. 
+ expf(-in[i])); + } +} + +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { + for (size_t i = 0; i < n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} + +void copy_cpu(size_t n, const float_t* in, float_t* out) { + // std::copy(in, in + n, out); + // memcpy(out, in, sizeof(float_t) * n); + cblas_scopy(n, in, 1, out, 1); +} + +// num rows in A, C; num columns in B, C; num columns in A, rows in B +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// TODO make parallel +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + +float reduce_mean(size_t n, const float_t* x) { + float_t sum = 0.; + for (size_t i = 0; i < n; i++) { + sum += (float_t)x[i]; + } + return sum / (float_t)n; +} + +} // end namespace math +} // end namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu new file mode 100644 index 0000000000..b9f7686867 --- /dev/null +++ b/libdeepgalois/src/math_functions.cu @@ -0,0 +1,800 @@ +#include "deepgalois/math_functions.hh" +#include "deepgalois/DistContext.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include + +__global__ void init_const_kernel(int n, float_t value, float_t* array) { + CUDA_KERNEL_LOOP(i, n) { array[i] = value; } +} + +void init_const_gpu(int n, float_t value, float_t* array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + +__global__ void isnan_test(const int n, const float* data, bool* result) { + CUDA_KERNEL_LOOP(i, n) { + if (isnan(data[i])) + *result = true; + } +} + +bool isnan_gpu(int n, const float_t* array) { + bool *d_result, h_result = false; + cudaMalloc((void**)&d_result, sizeof(bool)); + cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); + isnan_test<<>>(n, array, d_result); + CudaTest("solving init_const kernel failed"); + cudaMemcpy(&h_result, d_result, sizeof(bool), cudaMemcpyDeviceToHost); + return h_result; +} + +void gpu_rng_uniform(size_t n, float_t* r) { + CURAND_CHECK( + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); +} + +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK( + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK(curandGenerateNormal(deepgalois::DistContext::curand_generator(), r, + n, mu, sigma)); +} + +bool is_allocated_device(float_t* data) { + if (data == NULL) + return false; + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); + if (attributes.devicePointer != NULL) + return true; + return false; +} + +void float_malloc_device(int n, float_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); +} + +void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +void 
uint8_malloc_device(int n, uint8_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(uint8_t))); +} + +void uint8_free_device(uint8_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(uint8_t), cudaMemcpyHostToDevice)); +} + +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +} + +__global__ void setup_curand_kernel(const int n, curandState* state) { + CUDA_KERNEL_LOOP(i, n) { + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7 + i, i, 0, &state[i]); // Each thread gets different seed + } +} + +__global__ void dropout_kernel(int n, float scale, float threshold, + float_t* rands, const float_t* in, mask_t* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; + } +} + +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + float_t* rands; + float_malloc_device(n, rands); + gpu_rng_uniform(n, rands); + dropout_kernel<<>>( + n, scale, dropout_rate, rands, in, masks, out); + CudaTest("solving dropout kernel failed"); + float_free_device(rands); +} + +__global__ void d_dropout_kernel(int n, float scale, float threshold, + const float_t* in, const mask_t* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out) { + d_dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); + CudaTest("solving d_dropout kernel failed"); +} + +// flattern data into 1D before feed into the ReLU operater +__global__ void relu_kernel(const int n, const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : 0; } +} + +void relu_gpu(const int n, const float_t* in, float_t* out) { + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); +} + +__global__ void d_relu_kernel(const int n, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { out_diff[i] = data[i] > 0 ? in_diff[i] : 0; } +} + +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_relu_kernel<<>>(n, in_diff, data, + out_diff); + CudaTest("solving d_relu kernel failed"); +} + +// flattern data into 1D before feed into the ReLU operater +__global__ void leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; } +} + +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, + out); + CudaTest("solving leaky_relu kernel failed"); +} + +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { + out_diff[i] = in_diff[i] * (data[i] > 0 ? 
1.0 : epsilon); + } +} + +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_leaky_relu_kernel<<>>( + n, epsilon, in_diff, data, out_diff); + CudaTest("solving d_leaky_relu kernel failed"); +} + +__global__ void matmul_kernel(int x, int y, int z, const float_t* A, + const float_t* B, float_t* C) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; +} + +#define TILE_SZ 16 +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C) { + dim3 threadsPerBlock(TILE_SZ, TILE_SZ); + dim3 blocksPerGrid((y - 1) / TILE_SZ + 1, (x - 1) / TILE_SZ + 1); + matmul_kernel<<>>(x, y, z, A, B, C); + CudaTest("solving matmul kernel failed"); +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(deepgalois::DistContext::cublas_handle(), cuTransB, + cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, + N)); +} + +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix +// for vertex feature tensor. However, since cusparse only supports +// column-major, while feature tensor is stored in row-major, the actual +// computation is: C = trans(A x trans(B)). Currently, we use cublasSgeam to +// implement transposition and allocate intermediate workspace memory +// (transpose_C) for this. 
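+// Dimension bookkeeping for the call below (A is M x K in CSR, B is the
+// K x N row-major feature matrix, C is the M x N row-major output):
+//  - cusparseScsrmm2 reads B as column-major: the same buffer with leading
+//    dimension N is B^T (N x K), so op(B) = CUSPARSE_OPERATION_TRANSPOSE
+//    recovers B.
+//  - The SpMM result is written column-major into transpose_C (leading
+//    dimension M), which from the row-major point of view is (A * B)^T.
+//  - cublasSgeam then transposes transpose_C into C, yielding the row-major
+//    M x N product the rest of the code expects.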
+void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nnz_idx, const float* B, const float beta, + float* transpose_C, float* C) { + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); + // transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +/* +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float *transpose_C, float* C) { + std::cout << "[debug]: csrmm_gpu\n"; + cusparseSpMatDescr_t A_descr; + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, +A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + cusparseDnMatDescr_t B_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, +CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, +CUSPARSE_ORDER_COL)); size_t bufferSize; + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, +CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); + cudaDeviceSynchronize(); + void* buffer = NULL; + if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); + CUSPARSE_CHECK(cusparseSpMM(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, +CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +//*/ +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
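+          // The op is flipped on purpose: cuBLAS is column-major, so the
+          // row-major M x N matrix A, read with leading dimension N, is seen
+          // by cublasSgemv as the N x M matrix A^T. Requesting CUBLAS_OP_T on
+          // that reinterpretation therefore computes y = alpha * A * x +
+          // beta * y for CblasNoTrans, while CUBLAS_OP_N yields the
+          // A^T * x product.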
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(deepgalois::DistContext::cublas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float* X) { + CUBLAS_CHECK( + cublasSscal(deepgalois::DistContext::cublas_handle(), N, &alpha, X, 1)); +} + +void dot_gpu(const int n, const float* x, const float* y, float* out) { + CUBLAS_CHECK( + cublasSdot(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1, out)); +} + +void asum_gpu(const int n, const float* x, float* y) { + CUBLAS_CHECK(cublasSasum(deepgalois::DistContext::cublas_handle(), n, x, 1, y)); +} + +void scale_gpu(const int n, const float alpha, const float* x, float* y) { + CUBLAS_CHECK( + cublasScopy(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::DistContext::cublas_handle(), n, &alpha, y, 1)); +} + +__global__ void set_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = alpha; } +} + +void set_gpu(const int n, const float_t alpha, float_t* y) { + if (alpha == 0) { + CUDA_CHECK(cudaMemset(y, 0, sizeof(float_t) * n)); + return; + } + set_kernel<<>>(n, alpha, y); + CudaTest("solving set kernel failed"); +} + +__global__ void add_scalar_kernel(const int n, const float_t a, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] += a; } +} + +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(n, alpha, Y); + CudaTest("solving add_scalar kernel failed"); +} + +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a[i] + b[i]; } +} + +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(n, a, b, y); + CudaTest("solving vadd kernel failed"); +} + +__global__ void axpy_kernel(const int n, const float_t a, const float_t* x, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a * x[i] + y[i]; } +} + +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { + // axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK( + cublasSaxpy(deepgalois::DistContext::cublas_handle(), n, &a, x, 1, y, 1)); + CudaTest("solving axpy kernel failed"); +} + +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { + CUDA_KERNEL_LOOP(i, n) { + float_t product = a[i] * a[i]; + atomicAdd(sum, product); + } +} + +acc_t l2_norm_gpu(int n, const float_t* x) { + float_t sum = 0.0; + CUBLAS_CHECK(cublasSnrm2(deepgalois::DistContext::cublas_handle(), n, x, 1, &sum)); + // float_t *d_sum; + // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); + // l2_norm_kernel<<>>(n, x, d_sum); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), + // cudaMemcpyDeviceToHost)); + return (acc_t)sum / 2.0; +} + +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out) {} + +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff) {} + +void copy_gpu(int len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + +// TODO: use warp +__device__ void softmax_device(int n, const float_t* input, float_t* output) { + float_t max = input[0]; + for (int i = 1; i < n; i++) + if (input[i] > max) + max = input[i]; + float_t denominator = 0.0; + for (int i = 0; i < n; i++) { + output[i] = expf(input[i] - max); + denominator += output[i]; + if (output[i] < 0.0) + 
printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + // assert(output[i] >= 0.0); + } + assert(denominator != 0.0); + for (int i = 0; i < n; i++) { + output[i] /= denominator; + // assert(output[i] >= 0.0); + // assert(output[i] <= 1.0); + } +} + +__device__ void sigmoid_device(int n, const float_t* in, float_t* out) { + for (int i = 0; i < n; i++) + out[i] = 1. / (1. + expf(-in[i])); +} + +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= logf(float_t(1e-10)); + else + loss -= logf(p[idx]); +} + +// y: ground truth +// p: predictions +__device__ void cross_entropy_multi_device(int n, const label_t* y, + const float_t* p, float_t& loss) { + for (int i = 0; i < n; i++) { + if (y[i] == 0) + continue; + if (p[i] == float_t(0)) + loss -= logf(float_t(1e-10)); // avoid NaN exception + else + loss -= logf(p[i]); + } +} + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void softmax_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + // normalize using softmax + softmax_device(len, in_data + len * id, out_data + len * id); + // loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len * id, loss[id]); + } + } +} + +void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + sigmoid_device(len, in_data + len * id, out_data + len * id); + cross_entropy_multi_device(len, labels, out_data + len * id, loss[id]); + } + } +} + +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + sigmoid_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); + CudaTest("solving sigmoid_cross_entropy kernel failed"); +} + +__device__ void d_cross_entropy_device(int n, const label_t idx, + const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; + } +} + +__global__ void d_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, float_t* grad) { + int base = begin * len; + CUDA_KERNEL_LOOP(i, (end - begin) * len) { + int id = begin + i / len; + if (masks[id] == 1) { // masked + if (i % len == (int)labels[id]) + grad[i] = -1.0 / (data[i + base] + 1e-10); + else + grad[i] = 0.0; + // d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + } + } +} + +__global__ void d_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* 
grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + grad[wid * len + pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + grad[wid * len + pid] = 0.0; + } + } + } + } +} + +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, + float_t* dy) { + for (int i = 0; i < n; i++) { + dy[i] = 0; + for (int j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__global__ void d_softmax_kernel(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + d_softmax_device(len, data + len * id, in_grad + len * i, + out_grad + len * id); + } + } +} + +__global__ void d_softmax_warp(int len, int begin, int end, const mask_t* masks, + const float_t* data, const float_t* in_grad, + float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + p[warp_lane][pid] = data[base + pid]; + d[warp_lane][pid] = in_grad[wid * len + pid]; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = + (j == pid) ? 
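+                // Softmax Jacobian entry: d p_pid / d x_j is
+                // p_pid * (1 - p_pid) when j == pid and -p_j * p_pid
+                // otherwise; the loop accumulates its dot product with the
+                // incoming gradient held in d[warp_lane][*].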
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + out_grad[base + pid] = sum; + } + } + __syncthreads(); + } + } +} + +__global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* out, + float_t* diff) { + CUDA_KERNEL_LOOP(i, end - begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy_device(len, labels[id], out + len * id, out_grad); + d_softmax_device(len, out + len * id, out_grad, diff + len * id); + } + } +} + +__global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + d[warp_lane][pid] = 0.0; + } + } + __syncthreads(); + + // softmax derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + grad[base + pid] = sum; + } + } + __syncthreads(); + } + } +} + +void d_softmax_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + // d_softmax_cross_entropy_kernel<<>>( + // len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax_cross_entropy kernel failed"); + // float_t *grad; + // float_malloc_device((end-begin)*len, grad); + // d_cross_entropy_kernel<<>>( + // d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, labels, out, grad); + // CudaTest("solving d_cross_entropy kernel failed"); + // d_softmax_kernel<<>>( + // d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, out, grad, diff); + // CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); +} + +__global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; + } + __syncthreads(); + + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + // if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = + -(float_t)labels[base + pid] / (p[warp_lane][pid] + 1e-10); + // else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + } + } + __syncthreads(); + + // sigmoid derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base + pid] = dp * self * (float_t(1) - self); + } + } + __syncthreads(); + } + } +} + +void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_sigmoid_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); + CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); +} + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +// acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* +// loss); +acc_t 
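+// Device-side counterpart of the layers' CPU get_prediction_loss(): the
+// kernel sums loss[i] over vertices whose mask is set, and the result is
+// divided by the caller-supplied `count`. (The CPU path divides by the number
+// of masked samples it actually visited instead.)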
masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp new file mode 100644 index 0000000000..e5e9fa7c10 --- /dev/null +++ b/libdeepgalois/src/node.cpp @@ -0,0 +1,28 @@ +#include "deepgalois/layers/node.h" +#include + +namespace deepgalois { + +void edge::alloc() { + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; +} + +void edge::merge_grads(float_t* dst) { + assert(grad_ != NULL); + if (dst) + delete[] dst; + dst = new float_t[ft_dim_]; + std::copy(grad_, grad_ + ft_dim_, dst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) + dst[i] += grad_[sample * ft_dim_ + i]; + } +} + +void edge::clear_grads() { + std::fill(grad_, grad_ + ft_dim_ * num_samples_, float_t(0)); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu new file mode 100644 index 0000000000..2151162752 --- /dev/null +++ b/libdeepgalois/src/node.cu @@ -0,0 +1,24 @@ +#include "deepgalois/layers/node.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void edge::alloc() { + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); +} + +void edge::clear_grads() { + // CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp new file mode 100644 index 0000000000..4538d1c956 --- /dev/null +++ b/libdeepgalois/src/optimizer.cpp @@ -0,0 +1,101 @@ +#include "deepgalois/optimizer.h" +#include "galois/Galois.h" +#include "deepgalois/math_functions.hh" + +namespace deepgalois { + +void adagrad::update(const vec_t& dW, vec_t& W) { + vec_t& g = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + /* + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + */ +} + +void RMSprop::update(const vec_t& dW, vec_t& W) { + vec_t& g = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); +} + +void adam::update(const vec_t& dW, vec_t& W) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / 
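+          // Bias-corrected Adam step: b1_t and b2_t accumulate powers of b1
+          // and b2 across calls (see the updates after the loop), so this
+          // computes m_hat = mt / (1 - b1^t), v_hat = vt / (1 - b2^t) and
+          //   W -= alpha * m_hat / sqrt(v_hat + eps)
+          // with eps placed inside the square root in this variant.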
(float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + // TODO/NOTE: this is incorrect: adam parameters should not be shared + // among layers, but this is making it shared + b1_t *= b1; + b2_t *= b2; +} + +void adamax::update(const vec_t& dW, vec_t& W) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; +} + +void gradient_descent::update(const vec_t& dW, vec_t& W) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); +} + +void momentum::update(const vec_t& dW, vec_t& W) { + vec_t& dWprev = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); +} + +void nesterov_momentum::update(const vec_t& dW, vec_t& W) { + vec_t& dWprev = get<0>(W); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu new file mode 100644 index 0000000000..15f2fe5515 --- /dev/null +++ b/libdeepgalois/src/optimizer.cu @@ -0,0 +1,55 @@ +#include "deepgalois/optimizer.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" + +__global__ void update_kernel(const int n, float_t alpha, float_t b1, + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { + CUDA_KERNEL_LOOP(i, n) { + mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; + vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; + W[i] -= + alpha * (mt[i] / (1.0 - b1_t)) / sqrtf((vt[i] / (1.0 - b2_t)) + eps); + } +} + +namespace deepgalois { + +template +template +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t* key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; +} + +void adam::update(const vec_t& dW, vec_t& W) {} +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + // std::cout << "updating weights on GPU, n = " << n << "\n"; + // print_device_vector(10, dW, "dW"); + float_t* cache = get_gpu<0>(n, W); + float_t* velocity = get_gpu<1>(n, W); + + update_kernel<<>>( + n, alpha, b1, b2, b1_t, b2_t, eps, cache, velocity, dW, W); + b1_t *= b1; + b2_t *= b2; +} + +void adagrad::update_gpu(const size_t, const float_t*, float_t*) {} + +void RMSprop::update_gpu(const size_t, const float_t*, float_t*) {} + +void adamax::update_gpu(const size_t, const float_t*, float_t*) {} + +void gradient_descent::update_gpu(const size_t, const float_t*, float_t*) {} + +void momentum::update_gpu(const size_t, const float_t*, float_t*) {} + +void nesterov_momentum::update_gpu(const 
size_t, const float_t*, float_t*) {} + +} // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp new file mode 100644 index 0000000000..d7e1bcf44b --- /dev/null +++ b/libdeepgalois/src/reader.cpp @@ -0,0 +1,311 @@ +#include "deepgalois/reader.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" +#include "galois/Galois.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ +#include +#include +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif + +namespace deepgalois { + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. +size_t Reader::read_labels(bool is_single_class, label_t*& labels) { + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reader: Reading labels...\n"); +#endif + + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, num_classes; // m: number of samples + in >> m >> num_classes >> std::ws; + if (is_single_class) { + std::cout << "[" << myID + << "] Reader: Using single-class (one-hot) labels\n"; + // galois::gPrint("[", myID, + // "] Reader: Using single-class (one-hot) labels\n"); + labels = + new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + } else { + // galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) + // labels\n"); + std::cout << "[" << myID + << "] Reader: Using multi-class (one-hot) labels\n"; + labels = + new label_t[m * + num_classes]; // multi-class label for each vertex: N x E + } + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v * num_classes + idx] = x; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "[" << myID << "] Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + // galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); + // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << + // unsigned(labels[i]) << "\n"; + return num_classes; +} + +//! Read features, return the length of a feature vector +//! Features are stored in the Context class +size_t Reader::read_features(float_t*& feats, std::string filetype) { + // filetype = "txt"; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + size_t m, feat_len; // m = number of vertices + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + in.read((char*)feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u * feat_len + v] = w; + } + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + // for (auto i = 0; i < 6; i ++) + // for (auto j = 0; j < 6; j ++) + // std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << + // "\n"; + return feat_len; +} + +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, + size_t& end, mask_t* masks) { + std::cout << "n:" << n << "\n"; + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " + << end << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; + // galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", + // end, + // ") Number of valid samples: ", sample_count, " (", + // (float)sample_count / (float)n * (float)100, "\%)\n"); + in.close(); + return sample_count; +} + +void Reader::progressPrint(unsigned max, unsigned i) { + const unsigned nsteps = 10; + unsigned ineachstep = (max / nsteps); + if (ineachstep == 0) + ineachstep = 1; + if (i % ineachstep == 0) { + int progress = ((size_t)i * 100) / max + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} + +void Reader::readGraphFromGRFile(LearningGraph* g) { + std::string filename = path + dataset_str + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = 
mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t* outIdx = fptr; + fptr += nv; + uint32_t* fptr32 = (uint32_t*)fptr; + uint32_t* outs = fptr32; + fptr32 += ne; + if (ne % 2) + fptr32 += 1; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data not supported.\n"; + exit(1); + } + g->allocateFrom(nv, ne); + auto rowptr = g->row_start_host_ptr(); + for (unsigned vid = 0; vid < nv; ++vid) { + g->fixEndEdge(vid, le64toh(outIdx[vid])); + auto degree = rowptr[vid + 1] - rowptr[vid]; + for (unsigned jj = 0; jj < degree; ++jj) { + unsigned eid = rowptr[vid] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= nv) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, + eid); + exit(0); + } + g->constructEdge(eid, dst); + } + progressPrint(nv, vid); + } + ifs.close(); + + /* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); + */ + t.Stop(); + // double runtime = t.Millisecs(); + // std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + // << masterLength / 1000.0 / runtime << " MB/s)\n\n"; +} + +/* +void add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + //g.getData(src) = 1; + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); + bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } + for (auto e = begin; e != end; e++) { + auto dst = og.getEdgeDst(e); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + } else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); + } + } +} +//*/ + +} // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp new file mode 100644 index 0000000000..1b237ff7c3 --- /dev/null +++ b/libdeepgalois/src/utils.cpp @@ -0,0 +1,132 @@ +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif + +namespace deepgalois { + +// parallel prefix sum +template +OutTy* parallel_prefix_sum(const std::vector& in) { + const size_t block_size = 1 << 20; + const size_t num_blocks = (in.size() + block_size - 1) / block_size; + std::vector 
local_sums(num_blocks); + // count how many bits are set on each thread + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks + 1); + OutTy total = 0; + for (size_t block = 0; block < num_blocks; block++) { + bulk_prefix[block] = total; + total += local_sums[block]; + } + bulk_prefix[num_blocks] = total; + // TODO do not use new here: difficult to track and free later + OutTy* prefix = new OutTy[in.size() + 1]; + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); + prefix[in.size()] = bulk_prefix[num_blocks]; + return prefix; +} + +template uint32_t* +parallel_prefix_sum(const std::vector& in); + +// Compute the F1 score, also known as balanced F-score or F-measure +// The F1 score can be interpreted as a weighted average of the precision and +// recall, where an F1 score reaches its best value at 1 and worst score at 0. +// The relative contribution of precision and recall to the F1 score are equal. +// The formula for the F1 score is: +// F1 = 2 * (precision * recall) / (precision + recall) +// where precision = TP / (TP + FP), recall = TP / (TP + FN) +// TP: true positive; FP: false positive; FN: false negative. +// In the multi-class and multi-label case, this is the weighted average of the +// F1 score of each class. Please refer to +// https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf +// (p.1672) and +// https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp +acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, + size_t num_classes, label_t* ground_truth, + float_t* pred) { + // TODO dist version; make aware of distributed execution + double precision_cls(0.), recall_cls(0.), f1_accum(0.); + int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + + for (size_t col = 0; col < num_classes; col++) { + int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + + for (size_t row = begin; row < end; row++) { + if (masks == NULL || masks[row] == 1) { + auto idx = row * num_classes + col; + if (ground_truth[idx] == 1 && pred[idx] > 0.5) { + //__sync_fetch_and_add(&tp_cls, 1); + tp_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { + //__sync_fetch_and_add(&fp_cls, 1); + fp_cls += 1; + } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&fn_cls, 1); + fn_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&tn_cls, 1); + tn_cls += 1; + } + } + } + + tp_accum += tp_cls; + fn_accum += fn_cls; + fp_accum += fp_cls; + tn_accum += tn_cls; + precision_cls = + tp_cls + fp_cls > 0 ? (double)tp_cls / (double)(tp_cls + fp_cls) : 0.; + recall_cls = + tp_cls + fn_cls > 0 ? (double)tp_cls / (double)(tp_cls + fn_cls) : 0.; + f1_accum += + recall_cls + precision_cls > 0. + ? 2. 
* (recall_cls * precision_cls) / (recall_cls + precision_cls) + : 0.; + } + + double f1_macro = f1_accum / (double)num_classes; + // double accuracy_mic = + // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum + fp_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fp_accum) + : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; + double f1_micro = + recall_mic + precision_mic > 0. + ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) + : 0.; + + unsigned myID = 0; +#ifndef GALOIS_ENABLE_GPU + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed + << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; + // galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + + return f1_micro; +} + +} // namespace deepgalois diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 138a4edabd..2930d37cbf 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -21,7 +21,7 @@ target_include_directories(galois_dist_async PUBLIC target_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX) target_link_libraries(galois_dist_async PUBLIC galois_shmem) -target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) +#target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) if (GALOIS_USE_BARE_MPI) target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1) @@ -49,7 +49,7 @@ if (GALOIS_USE_LCI) add_dependencies(galois_dist_async lci) target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2) - target_include_directories(galois_dist_async PUBLIC + target_include_directories(galois_dist_async PUBLIC $ $ ) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h new file mode 100644 index 0000000000..8066f3a25e --- /dev/null +++ b/libdist/include/galois/BufferWrapper.h @@ -0,0 +1,115 @@ +#ifndef GALOIS_BUFFER_WRAPPER +#define GALOIS_BUFFER_WRAPPER +#include "galois/gstl.h" +#include + +namespace galois { + +//! Wraps a pointer representing an array with the number of elements the +//! array contains (or that we want to handle with this class) +//! +//! Used to avoid copying of memory into a vector for +//! serialization/deserialization purpose +//! @todo give this a better name +template +class BufferWrapper { +public: + using size_type = size_t; + using value_type = ElementType; + +private: + //! This vector is allocated when creating a buffer wrapper from scratch + //! (i.e. during deserialization into one) + galois::gstl::Vector dummy; + //! Raw memory kept by this class; either points to existing memory or is + //! empty (vector.data changes when this object is copied, causes issues + //! with correcntess) + ElementType* raw_memory; + //! Number of elements that can be accessed from the raw_memory pointer + size_type num_elements; + +public: + //! Default constructor 0s everything + BufferWrapper() { + dummy.clear(); + this->raw_memory = 0; + this->num_elements = 0; + } + + //! frees dummy vector + ~BufferWrapper() { + // explicit vector clear; regular destructor probably frees it, but + // doing it for safetey + if (dummy.size()) { + dummy.clear(); + } + } + + //! Save a pointer and the number of elements in that array that this can + //! 
access + BufferWrapper(ElementType* pointer, size_t num_elements_) + : raw_memory(pointer), num_elements(num_elements_){}; + + //! Returns element at some specified index of the array + ElementType& operator[](size_t index) { + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } + } + + //! Returns element at some specified index of the array; const i.e. not + //! modifiable + const ElementType& operator[](size_t index) const { + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } + } + + //! Return number of elements in the array + size_t size() const { return this->num_elements; } + + //! return unmodifiable pointer to raw_memory + const ElementType* data() const { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + + //! return pointer to raw_memory + ElementType* data() { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + + //! Allocates memory in the underlying vector; should only be used for + //! deserialization into this class during communication + //! This also means you shouldn't use raw_data + void resize(size_t new_size) { + if (!this->dummy.size()) { + this->dummy.resize(new_size); + this->num_elements = this->dummy.size(); + } else { + GALOIS_DIE("calling resize when there is already memory " + "allocated"); + } + } + + ElementType* get_vec_data() { + assert(this->dummy.size()); + return dummy.data(); + } +}; + +} // namespace galois +#endif diff --git a/libdist/include/galois/DTerminationDetector.h b/libdist/include/galois/DTerminationDetector.h index 0f6d696838..28c58b3666 100644 --- a/libdist/include/galois/DTerminationDetector.h +++ b/libdist/include/galois/DTerminationDetector.h @@ -150,10 +150,8 @@ class DGTerminator { bool terminate() { bool active = (local_mdata != 0); - // if (active) galois::gDebug("[", net.ID, "] local work done \n"); if (!active) { active = net.anyPendingSends(); - // if (active) galois::gDebug("[", net.ID, "] pending send \n"); } int snapshot_ended = 0; if (!active) { @@ -166,8 +164,6 @@ class DGTerminator { } if (!active) { // check pending receives after checking snapshot active = net.anyPendingReceives(); - if (active) - galois::gDebug("[", net.ID, "] pending receive"); } if (active) { work_done = true; @@ -178,16 +174,11 @@ class DGTerminator { work_done = false; prev_snapshot = snapshot; ++snapshot; - galois::gDebug("[", net.ID, "] work done, taking snapshot ", - snapshot); initiate_snapshot(); } else if (prev_snapshot != snapshot) { prev_snapshot = snapshot; - galois::gDebug("[", net.ID, "] no work done, taking snapshot ", - snapshot); initiate_snapshot(); } else { - galois::gDebug("[", net.ID, "] terminating ", snapshot); // an explicit barrier may be required here // so that the next async phase begins on all hosts at the same time // however, this may add overheads when it is not required diff --git a/libdist/include/galois/DistGalois.h b/libdist/include/galois/DistGalois.h index b87c539f3e..e39f311470 100644 --- a/libdist/include/galois/DistGalois.h +++ b/libdist/include/galois/DistGalois.h @@ -44,10 +44,10 @@ class DistMemSys : public runtime::SharedMem { ~DistMemSys(); - DistMemSys(const DistMemSys&) = delete; + DistMemSys(const DistMemSys&) = delete; DistMemSys& operator=(const DistMemSys&) = delete; - DistMemSys(DistMemSys&&) = delete; + DistMemSys(DistMemSys&&) = delete; DistMemSys& operator=(DistMemSys&&) = delete; }; 
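The BufferWrapper introduced above (libdist/include/galois/BufferWrapper.h) is what the later serialization changes build on: it wraps a raw pointer plus an element count so an existing array can be serialized without first being copied into a vector, while resize() switches it to an internally owned vector for the deserialization path. The standalone sketch below is illustrative only and not part of the patch; it uses just the members visible in the header (the wrapping constructor, operator[], size(), and resize()) and assumes a live galois::SharedMemSys, since the internal gstl::Vector is backed by the Galois runtime allocator.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include "galois/Galois.h"
#include "galois/BufferWrapper.h"

int main() {
  galois::SharedMemSys G; // runtime init assumed for gstl::Vector's allocator

  uint32_t payload[4] = {10, 20, 30, 40};

  // Sending side: wrap memory we already own; nothing is copied or allocated.
  galois::BufferWrapper<uint32_t> view(payload, 4);
  assert(view.size() == 4);
  assert(view[2] == 30);

  // Receiving side: a default-constructed wrapper owns no memory, so resize()
  // allocates the internal vector that deserialization would normally fill.
  galois::BufferWrapper<uint32_t> received;
  received.resize(view.size());
  for (size_t i = 0; i < received.size(); ++i)
    received[i] = view[i];
  assert(received[3] == 40);
  return 0;
}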
diff --git a/libdist/include/galois/runtime/Network.h b/libdist/include/galois/runtime/Network.h index e4695c0c2b..1560b20914 100644 --- a/libdist/include/galois/runtime/Network.h +++ b/libdist/include/galois/runtime/Network.h @@ -109,7 +109,7 @@ class NetworkInterface { //! tag (tag) and some data (buf) //! on the receiver, buf will be returned on a receiveTagged(tag) //! buf is invalidated by this operation - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int type = 0) = 0; //! Send a message to all hosts. A message is simply a @@ -123,9 +123,6 @@ class NetworkInterface { template void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param); - //! Receive and dispatch messages - void handleReceives(); - //! Wrapper to reset the mem usage tracker's stats inline void resetMemUsage() { memUsageTracker.resetMemUsage(); } @@ -134,8 +131,7 @@ class NetworkInterface { //! Receive a tagged message virtual std::optional> - recieveTagged(uint32_t tag, std::unique_lock* rlg, - int type = 0) = 0; + recieveTagged(uint32_t tag, int type = 0) = 0; //! move send buffers out to network virtual void flush() = 0; @@ -195,9 +191,6 @@ NetworkInterface& makeNetworkLCI(); //! @warning Should not be called within a parallel region; assumes only one //! thread is calling it substrate::Barrier& getHostBarrier(); -//! Returns a fence that ensures all pending messages are delivered, acting -//! like a memory-barrier -substrate::Barrier& getHostFence(); //////////////////////////////////////////////////////////////////////////////// // Implementations @@ -220,7 +213,7 @@ void NetworkInterface::sendSimple(uint32_t dest, SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param..., (uintptr_t)genericLandingPad); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } template diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index b7f7cab60e..bc3cad4b01 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -28,7 +28,9 @@ #define GALOIS_RUNTIME_SERIALIZE_H #include +#include #include +#include #include #include #include @@ -43,64 +45,81 @@ #include #include #include "galois/CopyableTuple.h" +#include "galois/BufferWrapper.h" #include "galois/Bag.h" namespace galois { namespace runtime { +struct BufferHeader { + enum class BufferType { kSingleMessage, kMultipleMessages, kPartialMessage }; + BufferType type{BufferType::kSingleMessage}; + uint8_t num_segments{1}; + uint8_t segment_id{0}; + uint8_t segment_tag{0}; +}; + class DeSerializeBuffer; // forward declaration for friend declaration /** * Buffer for serialization of data. Mainly used during network communication. */ class SerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + //! Access to a deserialize buffer friend DeSerializeBuffer; //! type of data buffer // using vTy = std::vector; - using vTy = galois::PODResizeableArray; + using vTy = galois::PODResizeableArray; + using size_type = vTy::size_type; + //! the actual data stored in this buffer vTy bufdata; public: //! default constructor - SerializeBuffer() = default; + SerializeBuffer() { + BufferHeader header; + insert(reinterpret_cast(&header), kHeaderSize); + } + //! disabled copy constructor SerializeBuffer(SerializeBuffer&& rhs) = default; - //! Creates a buffer from another buffer - //! @param d buffer to create from - //! 
@param len amount of copy from buffer d - SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {} + + SerializeBuffer& operator=(SerializeBuffer&& rhs) { + auto buf = std::move(rhs); + bufdata = std::move(buf.get()); + return *this; + } //! Push a character onto the serialize buffer inline void push(const char c) { bufdata.push_back(c); } //! Insert characters from a buffer into the serialize buffer void insert(const uint8_t* c, size_t bytes) { - bufdata.insert(bufdata.end(), c, c + bytes); + if (bytes > 0) { + bufdata.insert(bufdata.end(), c, c + bytes); + } } //! Insert characters from a buffer into the serialize buffer at a particular //! offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { - std::copy_n(c, bytes, bufdata.begin() + offset); + offset += kHeaderSize; + assert((offset + bytes) <= bufdata.size()); + if (bytes > 0) { + std::copy_n(c, bytes, bufdata.begin() + offset); + } } - /** - * Reserve space at the end for inserting new data into the serialize - * buffer - * - * @param bytes number of bytes to reserve at the end - * @returns offset to the end of the buffer before new space was reserved - */ - size_t encomber(size_t bytes) { - size_t retval = bufdata.size(); - bufdata.resize(retval + bytes); - return retval; - } + //! Returns an iterator to the beginning of the data in this serialize buffer + vTy::const_iterator begin() const { return bufdata.cbegin(); } + //! Returns an iterator to the end of the data in this serialize buffer + vTy::const_iterator end() const { return bufdata.cend(); } - void resize(size_t bytes) { bufdata.resize(bytes); } + void resize(size_t bytes) { bufdata.resize(kHeaderSize + bytes); } /** * Reserve more space in the serialize buffer. @@ -110,34 +129,20 @@ class SerializeBuffer { void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); } //! Returns a pointer to the data stored in this serialize buffer - const uint8_t* linearData() const { return bufdata.data(); } + const uint8_t* linearData() const { return bufdata.data() + kHeaderSize; } //! Returns vector of data stored in this serialize buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } - //! Returns an iterator to the beginning of the data in this serialize buffer - vTy::const_iterator begin() const { return bufdata.cbegin(); } - //! Returns an iterator to the end of the data in this serialize buffer - vTy::const_iterator end() const { return bufdata.cend(); } - - using size_type = vTy::size_type; - - //! Returns the size of the serialize buffer - size_type size() const { return bufdata.size(); } - - //! Utility print function for the serialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{" << std::hex; - for (auto& i : bufdata) - o << (unsigned int)i << " "; - o << std::dec << "}>"; + //! Get a pointer to the remaining data of the deserialize buffer + //! (as determined by offset) + const uint8_t* data() const { return bufdata.data() + kHeaderSize; } + uint8_t* data() { return bufdata.data() + kHeaderSize; } + uint8_t* DataAtOffset(size_t offset) { + return bufdata.data() + kHeaderSize + offset; } - //! Operator that calls the print function of the serialize buffer - friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) { - b.print(os); - return os; - } + //! Returns the size of the serialize buffer + size_type size() const { return bufdata.size() - kHeaderSize; } }; /** @@ -145,50 +150,54 @@ class SerializeBuffer { * communication. 
*/ class DeSerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); //! Access to serialize buffer friend SerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer - vTy bufdata; - int offset; + vTy bufdata{kHeaderSize}; + size_t offset{kHeaderSize}; public: //! Constructor initializes offset into buffer to 0 - DeSerializeBuffer() : offset(0) {} + DeSerializeBuffer() : offset(kHeaderSize) {} //! Disable copy constructor DeSerializeBuffer(DeSerializeBuffer&&) = default; //! Move constructor //! @param v vector to act as deserialize buffer //! @param start offset to start saving data into DeSerializeBuffer(vTy&& v, uint32_t start = 0) - : bufdata(std::move(v)), offset(start) {} + : bufdata(std::move(v)), offset(start + kHeaderSize) { + assert(bufdata.size() >= offset); + } //! Constructor that takes an existing vector to use as the deserialize //! buffer explicit DeSerializeBuffer(vTy& data) { bufdata.swap(data); - offset = 0; + offset = kHeaderSize; } /** * Initializes the deserialize buffer with a certain size * @param [in] count size to initialize buffer to */ - explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {} + explicit DeSerializeBuffer(int count) + : bufdata(count + kHeaderSize), offset(kHeaderSize) {} /** * Initializes the deserialize buffer using vector initialization from * 2 iterators. */ template - DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {} + DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{kHeaderSize} {} /** * Initialize a deserialize buffer from a serialize buffer */ - explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) { + explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(kHeaderSize) { bufdata.swap(buf.bufdata); } @@ -202,31 +211,15 @@ class DeSerializeBuffer { * @param count new size of buffer */ void reset(int count) { - offset = 0; - bufdata.resize(count); + offset = kHeaderSize; + bufdata.resize(count + kHeaderSize); } - //! Gets the current offset into the deserialize buffer - unsigned getOffset() const { return offset; } - //! Sets the offset into the deserialize buffer - void setOffset(unsigned off) { - assert(off <= size()); - offset = off; - } - - //! Gets the size of the deserialize buffer - unsigned size() const { return bufdata.size(); } - - //! Returns true if the deserialize buffer is empty - //! @returns true if the deserialize buffer is empty - bool empty() const { return bufdata.empty(); } - //! Get the next character in the deserialize buffer unsigned char pop() { return bufdata.at(offset++); } - //! Clears the last x bytes of the deserialize buffer, resizing it as well - //! @param x How many bytes from the end to clear - void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); } + //! Gets the size of the deserialize buffer + unsigned size() const { return bufdata.size() - offset; } /** * Extracts a certain amount of data from the deserialize buffer @@ -235,45 +228,23 @@ class DeSerializeBuffer { * @param num Amount of data to get from deserialize buffer */ void extract(uint8_t* dst, size_t num) { + assert(offset >= kHeaderSize); + assert((offset + num) <= bufdata.size()); if (num > 0) { - memcpy(dst, &bufdata[offset], num); + std::copy_n(&bufdata[offset], num, dst); offset += num; } } //! Get the underlying vector storing the data of the deserialize //! buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } //! 
Get a pointer to the underlying data of the deserialize buffer - void* linearData() { return &bufdata[0]; } - - //! Get a pointer to the remaining data of the deserialize buffer - //! (as determined by offset) - const uint8_t* r_linearData() const { return &bufdata[offset]; } - //! Get the remaining size of the deserialize buffer (as determined - //! by offset) - size_t r_size() const { return bufdata.size() - offset; } - - //! Checks if the current location in the deserialize buffer is aligned - //! to some size a - bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; } - - //! Utility print of deserialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{(" << offset << ") " << std::hex; - for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii) - o << (unsigned int)*ii << " "; - o << std::dec << "}>"; - } + void* linearData() { return &bufdata[offset]; } - //! Operator for printing deserialize buffer - friend std::ostream& operator<<(std::ostream& os, - const DeSerializeBuffer& buf) { - buf.print(os); - return os; - } + const uint8_t* data() const { return &bufdata[offset]; } + uint8_t* data() { return &bufdata[offset]; } }; namespace internal { @@ -307,6 +278,19 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +//! Size of BufferWrapper is size + number of things in it +template +inline size_t gSizedObj(const galois::BufferWrapper& data) { + return sizeof(size_t) + data.size() * sizeof(T); +} + +template +inline size_t gSizedObj(const std::unordered_map& data) { + size_t sz = 0; + for (auto i : data) + sz += gSizedObj(i.first) + gSizedObj(i.second); + return sz; +} /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. @@ -400,7 +384,7 @@ inline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); } * * @returns size of the deserialize buffer passed into it */ -inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); } +inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.size(); } /** * Returns the size of the passed in insert bag. @@ -472,6 +456,15 @@ inline void gSerializeObj( * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ +template +inline void gSerializeObj(SerializeBuffer& buf, + const std::unordered_map& data) { + gSerialize(buf, data.size()); + for (auto i : data) { + gSerialize(buf, i.first, i.second); + } +} + template inline void gSerializeObj(SerializeBuffer& buf, const T& data, @@ -563,6 +556,11 @@ template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data); +// Forward declaration of buff serialize +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data); + /** * Serialize a sequence type into a buffer. * @@ -610,6 +608,18 @@ inline void gSerializeObj(SerializeBuffer& buf, gSerializeSeq(buf, data); } +//! Serialize BufferWrapper similarly to vector +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data) { + if (is_memory_copyable::value) { + gSerializeLinearSeq(buf, data); + } else { + GALOIS_DIE("have not implemented support for serializing nonPOD buffer " + "wrapper"); + } +} + /** * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. 
@@ -654,7 +664,7 @@ inline void gSerializeObj(SerializeBuffer& buf, * @param [in] data serialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { - buf.insert(data.linearData(), data.size()); + buf.insert(data.data(), data.size()); } /** @@ -665,7 +675,7 @@ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { */ inline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) { // buf.reserve(rbuf.r_size()); - buf.insert(rbuf.r_linearData(), rbuf.r_size()); + buf.insert(rbuf.data(), rbuf.size()); } /** @@ -729,8 +739,10 @@ gSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) { "Not POD Sequence"); typename Seq::size_type size = num; internal::gSerializeObj(buf, size); - size_t tsize = sizeof(typename Seq::value_type); - return LazyRef{buf.encomber(tsize * num)}; + size_t tsize = sizeof(typename Seq::value_type); + size_t cur_size = buf.size(); + buf.resize(cur_size + (tsize * num)); + return LazyRef{cur_size}; } /** @@ -800,6 +812,19 @@ void gDeserializeObj( data.deserialize(buf); } +template +void gDeserializeObj(DeSerializeBuffer& buf, std::unordered_map& data) { + uint64_t elts; + gDeserializeObj(buf, elts); + for (uint64_t i = 0; i < elts; i++) { + std::pair elt; + gDeserialize(buf, elt.first, elt.second); + if (buf.size() <= 0) { + break; + } + data[elt.first] = elt.second; + } +} /** * Deserialize a pair from a buffer. * @@ -921,6 +946,10 @@ gDeserializeObj(DeSerializeBuffer& buf, template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data); +// Forward declaration of buff wrapper deserialize +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& data); + /** * Deserialize into a sequence object * @@ -948,18 +977,10 @@ void gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) { template void gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) { typedef typename Seq::value_type T; - // seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); - // If the alignment is right, cast to a T array and insert - if (buf.atAlignment(alignof(T))) { - T* src = (T*)buf.r_linearData(); - seq.assign(src, &src[size]); - buf.setOffset(buf.getOffset() + size * sizeof(T)); - } else { - seq.resize(size); - buf.extract((uint8_t*)seq.data(), size * sizeof(T)); - } + seq.resize(size); + buf.extract((uint8_t*)seq.data(), size * sizeof(T)); } /** @@ -988,6 +1009,20 @@ void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data) { gDeserializeSeq(buf, data); } +//! 
deserialize into buf wrapper +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { + if (is_memory_copyable::value) { + // manual deserialization here + size_t buffer_size{0}; + gDeserializeObj(buf, buffer_size); + bf.resize(buffer_size); + buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); + } else { + GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); + } +} + /** * Deserialize into a PODResizeableArray * @@ -1051,9 +1086,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libdist/src/Barrier.cpp b/libdist/src/Barrier.cpp index 455e22aaed..0558d8ebb4 100644 --- a/libdist/src/Barrier.cpp +++ b/libdist/src/Barrier.cpp @@ -41,52 +41,6 @@ #include "galois/runtime/BareMPI.h" namespace { -class HostFence : public galois::substrate::Barrier { -public: - virtual const char* name() const { return "HostFence"; } - - virtual void reinit(unsigned) {} - - //! control-flow barrier across distributed hosts - //! acts as a distributed-memory fence as well (flushes send and receives) - virtual void wait() { - auto& net = galois::runtime::getSystemNetworkInterface(); - - if (galois::runtime::evilPhase == 0) { - galois::gWarn("evilPhase is 0, implying loop-around or no use: fence " - "may not work correctly!"); - } - - for (unsigned h = 0; h < net.Num; ++h) { - if (h == net.ID) - continue; - galois::runtime::SendBuffer b; - galois::runtime::gSerialize(b, net.ID + 1); // non-zero message - net.sendTagged(h, galois::runtime::evilPhase, b); - } - net.flush(); // flush all sends - - unsigned received = 1; // self - while (received < net.Num) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - do { - net.handleReceives(); // flush all receives from net.sendMsg() or - // net.sendSimple() - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); - } while (!p); - assert(p->first != net.ID); - // ignore received data - ++received; - } - ++galois::runtime::evilPhase; - if (galois::runtime::evilPhase >= - static_cast( - std::numeric_limits::max())) { // limit defined by MPI or - // LCI - galois::runtime::evilPhase = 1; - } - } -}; class HostBarrier : public galois::substrate::Barrier { public: @@ -110,8 +64,3 @@ galois::substrate::Barrier& galois::runtime::getHostBarrier() { static HostBarrier b; return b; } - -galois::substrate::Barrier& galois::runtime::getHostFence() { - static HostFence b; - return b; -} diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index 8faf4cee5a..1fe46bc514 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -105,8 +105,8 @@ void DistStatManager::combineAtHost_0_helper(void) { SendBuffer b; gSerialize(b, hTotalMap.region(i), hTotalMap.category(i), hTotalMap.stat(i).totalTy()); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -126,8 +126,8 @@ void DistStatManager::combineAtHost_0_helper(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, 
galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -151,8 +151,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -172,8 +172,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -182,10 +182,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { RecvBuffer& b = p->second; @@ -203,10 +203,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -230,10 +230,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -255,10 +255,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -286,13 +286,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper(); getSystemNetworkInterface().flush(); + // work done before check + td += 1; + // barrier while (td.reduce()) { + td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); @@ -302,13 +307,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper2(); getSystemNetworkInterface().flush(); + td += 1; + // barrier while (td.reduce()) { + td.reset(); + if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); diff --git a/libdist/src/Network.cpp b/libdist/src/Network.cpp index 44a189f7ad..7bf499a00b 100644 --- a/libdist/src/Network.cpp +++ b/libdist/src/Network.cpp @@ -93,7 +93,7 @@ 
void NetworkInterface::sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf) { gSerialize(buf, recv); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), @@ -104,30 +104,14 @@ void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), if (x != ID) { SendBuffer b; gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad); - sendTagged(x, 0, b); + sendTagged(x, 0, std::move(b)); } else if (self) { - RecvBuffer rb(buf.begin(), buf.end()); + RecvBuffer rb = RecvBuffer(std::move(buf.get())); recv(ID, rb); } } } -void NetworkInterface::handleReceives() { - std::unique_lock lg; - auto opt = recieveTagged(0, &lg); - while (opt) { - uint32_t src = std::get<0>(*opt); - RecvBuffer& buf = std::get<1>(*opt); - uintptr_t fp = 0; - gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp); - buf.pop_back(sizeof(uintptr_t)); - assert(fp); - auto f = (void (*)(uint32_t, RecvBuffer&))fp; - f(src, buf); - opt = recieveTagged(0, &lg); - } -} - NetworkInterface& galois::runtime::getSystemNetworkInterface() { #ifndef GALOIS_USE_LCI return makeNetworkBuffered(); diff --git a/libdist/src/NetworkBuffered.cpp b/libdist/src/NetworkBuffered.cpp index 7b6d6c6ce1..a58f16c3ab 100644 --- a/libdist/src/NetworkBuffered.cpp +++ b/libdist/src/NetworkBuffered.cpp @@ -67,6 +67,12 @@ class NetworkInterfaceBuffered : public NetworkInterface { // using vTy = std::vector; using vTy = galois::PODResizeableArray; + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + static constexpr uint8_t kMaxSegmentTag = std::numeric_limits::max(); + static constexpr size_t kMaxBufferSize = + static_cast(std::numeric_limits::max()); + static constexpr size_t kMaxDataSize = kMaxBufferSize - kHeaderSize; + /** * Receive buffers for the buffered network interface */ @@ -77,6 +83,38 @@ class NetworkInterfaceBuffered : public NetworkInterface { // tag of head of queue std::atomic dataPresent; + struct PartialMessages { + uint8_t num_segments{0}; + std::vector segments; + }; + std::unordered_map partial_messages_map_; + + std::optional CombinePartialMessages(const BufferHeader& header, + vTy&& vec) { + auto& partial_messages = partial_messages_map_[header.segment_tag]; + if (partial_messages.num_segments == 0) { + partial_messages.segments.resize(header.num_segments); + } + + partial_messages.segments[header.segment_id] = std::move(vec); + ++partial_messages.num_segments; + + if (partial_messages.num_segments != header.num_segments) { + assert(partial_messages.num_segments < header.num_segments); + assert(partial_messages.segments.size() == header.num_segments); + return std::nullopt; + } + + std::vector& segments = partial_messages.segments; + vTy message = std::move(segments[0]); + for (size_t i = 1, end = segments.size(); i < end; ++i) { + message.insert(message.end(), segments[i].begin() + kHeaderSize, + segments[i].end()); + } + partial_messages_map_.erase(header.segment_tag); + return std::make_optional(std::move(message)); + } + bool sizeAtLeast(size_t n, uint32_t tag) { size_t tot = -frontOffset; for (auto& v : data) { @@ -163,30 +201,6 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::optional popMsg(uint32_t tag, std::atomic& inflightRecvs) { std::lock_guard lg(qlock); -#ifndef NO_AGG - uint32_t len = getLenFromFront(tag); - // assert(len); - if (len == ~0U || len == 0) - return std::optional(); - if (!sizeAtLeast(sizeof(uint32_t) + len, tag)) - return std::optional(); - erase(4, 
inflightRecvs); - - // Try just using the buffer - if (auto r = popVec(len, inflightRecvs)) { - auto start = r->size() - len; - // std::cerr << "FP " << r->size() << " " << len << " " << start - // << "\n"; - return std::optional(RecvBuffer(std::move(*r), start)); - } - - RecvBuffer buf(len); - // FIXME: This is slows things down 25% - copyOut((char*)buf.linearData(), len); - erase(len, inflightRecvs); - // std::cerr << "p " << tag << " " << len << "\n"; - return std::optional(std::move(buf)); -#else if (data.empty() || data.front().tag != tag) return std::optional(); @@ -201,31 +215,28 @@ class NetworkInterfaceBuffered : public NetworkInterface { } return std::optional(RecvBuffer(std::move(vec), 0)); -#endif } // Worker thread interface - void add(NetworkIO::message m) { + bool add(NetworkIO::message m) { + BufferHeader* header = reinterpret_cast(m.data.data()); + if (header->type == BufferHeader::BufferType::kPartialMessage) { + std::optional segment = + CombinePartialMessages(*header, std::move(m.data)); + if (!segment) { + return false; + } + + m.data = std::move(*segment); + } std::lock_guard lg(qlock); if (data.empty()) { galois::runtime::trace("ADD LATEST ", m.tag); dataPresent = m.tag; } - // std::cerr << m.data.size() << " " << - // std::count(m.data.begin(), m.data.end(), 0) << "\n"; - // for (auto x : m.data) { - // std::cerr << (int) x << " "; - // } - // std::cerr << "\n"; - // std::cerr << "A " << m.host << " " << m.tag << " " << m.data.size() << - // "\n"; - data.push_back(std::move(m)); - - assert(data.back().data.size() != - (unsigned int)std::count(data.back().data.begin(), - data.back().data.end(), 0)); + return true; } bool hasData(uint32_t tag) { return dataPresent == tag; } @@ -245,7 +256,7 @@ class NetworkInterfaceBuffered : public NetworkInterface { struct msg { uint32_t tag; vTy data; - msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {} + msg(uint32_t t, vTy&& _data) : tag(t), data(std::move(_data)) {} }; std::deque messages; @@ -254,6 +265,43 @@ class NetworkInterfaceBuffered : public NetworkInterface { //! @todo FIXME track time since some epoch in an atomic. 
std::chrono::high_resolution_clock::time_point time; SimpleLock lock, timelock; + uint8_t segment_tag_{0}; + + void IncrementSegmentTag() { + if (segment_tag_ == kMaxSegmentTag) { + segment_tag_ = 0; + } else { + ++segment_tag_; + } + } + + std::vector Split(uint32_t host, uint32_t tag, + vTy&& vec) { + std::vector segments; + segments.emplace_back(std::move(vec)); + auto begin = segments[0].begin(); + for (size_t i = kMaxBufferSize, end = segments[0].size(); i < end; + i += kMaxDataSize) { + vTy segment(kHeaderSize); + size_t segment_end = std::min(end, i + kMaxDataSize); + segment.insert(segment.end(), begin + i, begin + segment_end); + segments.emplace_back(std::move(segment)); + } + segments[0].resize(kMaxBufferSize); + + std::vector msg; + for (size_t i = 0; i < segments.size(); ++i) { + auto& segment = segments[i]; + BufferHeader* header = reinterpret_cast(segment.data()); + header->type = BufferHeader::BufferType::kPartialMessage; + header->num_segments = segments.size(); + header->segment_id = i; + header->segment_tag = segment_tag_; + msg.emplace_back(host, tag, std::move(segment)); + } + IncrementSegmentTag(); + return msg; + } public: unsigned long statSendTimeout; @@ -269,103 +317,35 @@ class NetworkInterfaceBuffered : public NetworkInterface { } } - bool ready() { -#ifndef NO_AGG - if (numBytes == 0) - return false; - if (urgent) { - ++statSendUrgent; - return true; - } - if (numBytes > COMM_MIN) { - ++statSendOverflow; - return true; - } - auto n = std::chrono::high_resolution_clock::now(); - decltype(n) mytime; - { - std::lock_guard lg(timelock); - mytime = time; - } - auto elapsed = - std::chrono::duration_cast(n - mytime); - if (elapsed.count() > COMM_DELAY) { - ++statSendTimeout; - return true; - } - return false; -#else - return messages.size() > 0; -#endif - } + bool ready() { return messages.size() > 0; } - std::pair - assemble(std::atomic& GALOIS_UNUSED(inflightSends)) { + std::vector assemble(uint32_t host) { std::unique_lock lg(lock); - if (messages.empty()) - return std::make_pair(~0, vTy()); -#ifndef NO_AGG - // compute message size - uint32_t len = 0; - int num = 0; - uint32_t tag = messages.front().tag; - for (auto& m : messages) { - if (m.tag != tag) { - break; - } else { - // do not let it go over the integer limit because MPI_Isend cannot - // deal with it - if ((m.data.size() + sizeof(uint32_t) + len + num) > - static_cast(std::numeric_limits::max())) { - break; - } - len += m.data.size(); - num += sizeof(uint32_t); - } - } - lg.unlock(); - // construct message - vTy vec; - vec.reserve(len + num); - // go out of our way to avoid locking out senders when making messages - lg.lock(); - do { - auto& m = messages.front(); - lg.unlock(); - union { - uint32_t a; - uint8_t b[sizeof(uint32_t)]; - } foo; - foo.a = m.data.size(); - vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]); - vec.insert(vec.end(), m.data.begin(), m.data.end()); - if (urgent) - --urgent; - lg.lock(); - messages.pop_front(); - --inflightSends; - } while (vec.size() < len + num); - ++inflightSends; - numBytes -= len; -#else + assert(!messages.empty()); uint32_t tag = messages.front().tag; vTy vec(std::move(messages.front().data)); messages.pop_front(); -#endif - return std::make_pair(tag, std::move(vec)); + + if (vec.size() > kMaxBufferSize) { + return Split(host, tag, std::move(vec)); + } + + BufferHeader* header = reinterpret_cast(vec.data()); + header->type = BufferHeader::BufferType::kSingleMessage; + std::vector msgs; + msgs.emplace_back(host, tag, std::move(vec)); + return msgs; } 
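// assemble() now emits exactly one network message per queued send buffer:
// buffers no larger than kMaxBufferSize leave as a single kSingleMessage,
// while larger ones are cut by Split() into segments holding at most
// kMaxDataSize payload bytes behind a BufferHeader tagged
// {kPartialMessage, num_segments, segment_id, segment_tag}. On the receive
// side, add() feeds partial segments to CombinePartialMessages(), which
// groups them by segment_tag and only enqueues the reassembled buffer for
// popMsg() once all num_segments have arrived.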
-    void add(uint32_t tag, vTy& b) {
+    void add(uint32_t tag, vTy&& b) {
       std::lock_guard<SimpleLock> lg(lock);
       if (messages.empty()) {
         std::lock_guard<SimpleLock> lg(timelock);
         time = std::chrono::high_resolution_clock::now();
       }
-      unsigned oldNumBytes = numBytes;
+      assert(b.size() >= kHeaderSize);
       numBytes += b.size();
-      galois::runtime::trace("BufferedAdd", oldNumBytes, numBytes, tag,
-                             galois::runtime::printVec(b));
-      messages.emplace_back(tag, b);
+      messages.emplace_back(tag, std::move(b));
     }
   }; // end send buffer class
@@ -402,24 +382,26 @@ class NetworkInterfaceBuffered : public NetworkInterface {
       // handle send queue i
       auto& sd = sendData[i];
       if (sd.ready()) {
-        NetworkIO::message msg;
-        msg.host = i;
-        std::tie(msg.tag, msg.data) = sd.assemble(inflightSends);
-        galois::runtime::trace("BufferedSending", msg.host, msg.tag,
-                               galois::runtime::printVec(msg.data));
-        ++statSendEnqueued;
-        netio->enqueue(std::move(msg));
+        std::vector<NetworkIO::message> msgs = sd.assemble(i);
+        if (msgs.size() > 1) {
+          inflightSends += msgs.size() - 1;
+        }
+
+        for (auto& msg : msgs) {
+          ++statSendEnqueued;
+          netio->enqueue(std::move(msg));
+        }
       }
+
       // handle receive
       NetworkIO::message rdata = netio->dequeue();
       if (rdata.data.size()) {
         ++statRecvDequeued;
-        assert(rdata.data.size() !=
-               (unsigned int)std::count(rdata.data.begin(), rdata.data.end(),
-                                        0));
-        galois::runtime::trace("BufferedRecieving", rdata.host, rdata.tag,
-                               galois::runtime::printVec(rdata.data));
-        recvData[rdata.host].add(std::move(rdata));
+        uint32_t h = rdata.host;
+        bool not_partial_segment = recvData[h].add(std::move(rdata));
+        if (!not_partial_segment) {
+          --inflightRecvs;
+        }
       }
     }
   }
@@ -454,22 +436,19 @@ class NetworkInterfaceBuffered : public NetworkInterface {
 
   std::unique_ptr<NetworkIO> netio;
 
-  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf,
+  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf,
                           int phase) {
-    ++inflightSends;
     tag += phase;
     statSendNum += 1;
-    statSendBytes += buf.size();
-    galois::runtime::trace("sendTagged", dest, tag,
-                           galois::runtime::printVec(buf.getVec()));
+    statSendBytes += buf.size() + kHeaderSize;
+    memUsageTracker.incrementMemUsage(buf.size() + kHeaderSize);
+    ++inflightSends;
     auto& sd = sendData[dest];
-    sd.add(tag, buf.getVec());
+    sd.add(tag, std::move(buf.get()));
   }
 
   virtual std::optional<std::pair<uint32_t, RecvBuffer>>
-  recieveTagged(uint32_t tag,
-                std::unique_lock<galois::substrate::SimpleLock>* rlg,
-                int phase) {
+  recieveTagged(uint32_t tag, int phase) {
     tag += phase;
     for (unsigned h = 0; h < recvData.size(); ++h) {
       auto& rq = recvData[h];
@@ -480,12 +459,8 @@ class NetworkInterfaceBuffered : public NetworkInterface {
       auto buf = rq.popMsg(tag, inflightRecvs);
       if (buf) {
         ++statRecvNum;
-        statRecvBytes += buf->size();
-        memUsageTracker.decrementMemUsage(buf->size());
-        if (rlg)
-          *rlg = std::move(lg);
-        galois::runtime::trace("recvTagged", h, tag,
-                               galois::runtime::printVec(buf->getVec()));
+        statRecvBytes += buf->size() + kHeaderSize;
+        memUsageTracker.decrementMemUsage(buf->size() + kHeaderSize);
         anyReceivedMessages = true;
         return std::optional<std::pair<uint32_t, RecvBuffer>>(
             std::make_pair(h, std::move(*buf)));
diff --git a/libdist/src/NetworkLCI.cpp b/libdist/src/NetworkLCI.cpp
index 59b17a1d35..3770356c8c 100644
--- a/libdist/src/NetworkLCI.cpp
+++ b/libdist/src/NetworkLCI.cpp
@@ -182,8 +182,8 @@ class NetworkInterfaceLCI : public NetworkInterface {
     statSendBytes += buf.size();
     // int count = 0;
 #ifndef GALOIS_SUPPORT_ASYNC
-    if (buf.getVec().size() < 8192) {
-      while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag,
+    if (buf.get().size() < 8192) {
+      while (lc_sendm(buf.get().data(), buf.get().size(), dest, tag,
                       lc_p2p_ep[phase]) != LC_OK) {
         sched_yield();
       }
@@ -191,7 +191,7 @@ class NetworkInterfaceLCI : public NetworkInterface {
 #endif
     {
       pendingReq* msg =
-          new pendingReq(dest, tag, phase, buf.getVec(), inflightSends);
+          new pendingReq(dest, tag, phase, buf.get(), inflightSends);
       while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag,
                       lc_p2p_ep[phase], free_req, msg) != LC_OK) {
         sched_yield();
diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt
index 8e9d56d48e..4721bc0261 100644
--- a/libgalois/CMakeLists.txt
+++ b/libgalois/CMakeLists.txt
@@ -10,7 +10,7 @@ set(sources
   "${CMAKE_CURRENT_BINARY_DIR}/Version.cpp"
   src/Barrier_Counting.cpp
   src/Barrier.cpp
-  src/Barrier_Dissemination.cpp 
+  src/Barrier_Dissemination.cpp
   src/Barrier_MCS.cpp
   src/Barrier_Pthread.cpp
   src/Barrier_Simple.cpp
@@ -86,6 +86,7 @@ endif()
 
 target_link_libraries(galois_shmem INTERFACE pygalois)
 target_link_libraries(galois_shmem PRIVATE Threads::Threads)
+target_link_libraries(galois_shmem PUBLIC galois_support)
 
 if (CMAKE_HAVE_PTHREAD_H)
   target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD)
diff --git a/libgalois/include/galois/Atomic.h b/libgalois/include/galois/Atomic.h
deleted file mode 100644
index e073bf5aa7..0000000000
--- a/libgalois/include/galois/Atomic.h
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * This file belongs to the Galois project, a C++ library for exploiting
- * parallelism. The code is being released under the terms of the 3-Clause BSD
- * License (a copy is located in LICENSE.txt at the top-level directory).
- *
- * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
- * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
- * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
- * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
- * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
- * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
- * shall University be liable for incidental, special, indirect, direct or
- * consequential damages or loss of profits, interruption of business, or
- * related expenses which may arise from use of Software or Documentation,
- * including but not limited to those resulting from defects in Software and/or
- * Documentation, or loss or inaccuracy of data of any kind.
- */
-
-#ifndef GALOIS_ATOMIC_H
-#define GALOIS_ATOMIC_H
-
-#include <iterator>
-
-#include "galois/config.h"
-#include "galois/substrate/CacheLineStorage.h"
-
-namespace galois {
-
-namespace internal {
-/**
- * Common implementation.
- */
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicImpl {
-  // galois::runtime::LL::CacheLineStorage<T> val;
-  W<T> val;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicImpl(const T& i) : val(i) {}
-  //! default constructor
-  GAtomicImpl() {}
-
-  //! atomic add and fetch
-  T operator+=(const T& rhs) { return __sync_add_and_fetch(&val.data, rhs); }
-  //! atomic sub and fetch
-  T operator-=(const T& rhs) { return __sync_sub_and_fetch(&(val.data), rhs); }
-  //! atomic increment and fetch
-  T operator++() { return __sync_add_and_fetch(&(val.data), 1); }
-  //! atomic fetch and increment
-  T operator++(int) { return __sync_fetch_and_add(&(val.data), 1); }
-  //! atomic decrement and fetch
-  T operator--() { return __sync_sub_and_fetch(&(val.data), 1); }
-  //! atomic fetch and decrement
-  T operator--(int) { return __sync_fetch_and_sub(&(val.data), 1); }
-  //! conversion operator to base data type
-  operator T() const { return val.data; }
-  //! assign from underlying type
-  T& operator=(const T& i) { return val.data = i; }
-  //! assignment operator
-  T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; }
-  //! direct compare and swap
-  bool cas(const T& expected, const T& updated) {
-    if (val.data != expected) {
-      return false;
-    }
-#if defined(__INTEL_COMPILER)
-    return __sync_bool_compare_and_swap(
-        &val.data, *reinterpret_cast<const ptrdiff_t*>(&expected),
-        *reinterpret_cast<const ptrdiff_t*>(&updated));
-#else
-    return __sync_bool_compare_and_swap(&val.data, expected, updated);
-#endif
-  }
-};
-
-// non-current version
-template <typename T, template <typename _> class W>
-class GAtomicImpl<T, W, false> {
-  // galois::runtime::LL::CacheLineStorage<T> val;
-  W<T> val;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicImpl(const T& i) : val(i) {}
-  //! default constructor
-  GAtomicImpl() {}
-
-  //! atomic add and fetch
-  T operator+=(const T& rhs) { return (val.data += rhs); }
-  //! atomic sub and fetch
-  T operator-=(const T& rhs) { return (val.data -= rhs); }
-  //! atomic increment and fetch
-  T operator++() { return ++(val.data); }
-  //! atomic fetch and increment
-  T operator++(int) { return (val.data)++; }
-  //! atomic decrement and fetch
-  T operator--() { return --(val.data); }
-  //! atomic fetch and decrement
-  T operator--(int) { return (val.data)--; }
-  //! conversion operator to base data type
-  operator T() const { return val.data; }
-  //! assign from underlying type
-  T& operator=(const T& i) { return val.data = i; }
-  //! assignment operator
-  T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; }
-  //! direct compare and swap
-  bool cas(const T& expected, const T& updated) {
-    if (val.data != expected) {
-      return false;
-    } else {
-      val.data = updated;
-      return true;
-    }
-  }
-};
-
-//! Basic atomic
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase : public GAtomicImpl<T, W, CONCURRENT> {
-  typedef GAtomicImpl<T, W, CONCURRENT> Super_ty;
-
-public:
-  //! Initialize with a value
-  explicit GAtomicBase(const T& i) : Super_ty(i) {}
-
-  //! default constructor
-  GAtomicBase() : Super_ty() {}
-
-  T& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); }
-
-  T& operator=(const T& that) { return Super_ty::operator=(that); }
-};
-
-//! Specialization for pointers
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase<T*, W, CONCURRENT> : public GAtomicImpl<T*, W, CONCURRENT> {
-  typedef GAtomicImpl<T*, W, CONCURRENT> Super_ty;
-
-public:
-  typedef typename std::iterator_traits<T*>::difference_type difference_type;
-
-  GAtomicBase() : Super_ty() {}
-
-  GAtomicBase(T* i) : Super_ty(i) {}
-
-  T*& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); }
-
-  T*& operator=(T* that) { return Super_ty::operator=(that); }
-
-  T* operator+=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_add_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data += rhs);
-    }
-  }
-
-  T* operator-=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_sub_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data -= rhs);
-    }
-  }
-};
-
-//! Specialization for const pointers
-template <typename T, template <typename _> class W, bool CONCURRENT>
-class GAtomicBase<const T*, W, CONCURRENT>
-    : public GAtomicImpl<const T*, W, CONCURRENT> {
-  typedef GAtomicImpl<const T*, W, CONCURRENT> Super_ty;
-
-public:
-  typedef
-      typename std::iterator_traits<const T*>::difference_type difference_type;
-
-  GAtomicBase() : Super_ty() {}
-
-  GAtomicBase(const T* i) : Super_ty(i) {}
-
-  const T*& operator=(const GAtomicBase& that) {
-    return Super_ty::operator=(that);
-  }
-
-  const T*& operator=(const T* that) { return Super_ty::operator=(that); }
-
-  const T* operator+=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_add_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data += rhs);
-    }
-  }
-
-  const T* operator-=(const difference_type& rhs) {
-    if (CONCURRENT) {
-      return __sync_sub_and_fetch(&Super_ty::val.data, rhs);
-    } else {
-      return (Super_ty::val.data -= rhs);
-    }
-  }
-};
-
-//! Specialization for bools
-template