sandialabs
diff --git a/‎CMakeLists.txt
+26-14 b/‎CMakeLists.txt
+26-14
diff --git a/‎README.md
+6-1 b/‎README.md
+6-1
diff --git a/‎contract.hpp
+23-124 b/‎contract.hpp
+23-124
@@ -1,19 +1,28 @@
-#IMPORTANT: Use the cmake flag -DCMAKE_CXX_COMPILER=/path/to/your/nvcc_wrapper
 cmake_minimum_required(VERSION 3.18)
 project(jetpartition CXX)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+# GKlib is a dependency of Metis when built from its github repo
+# Unfortunately there is no simple way to avoid also linking GKlib in this case
+# Older distributions of metis do not create this dependency
+SET(LINK_GKLIB False CACHE BOOL "Newer Metis distributions require us to link GKlib")
 
-#ALL DEVICE BUILDS
 find_package(KokkosKernels REQUIRED)
-add_compile_options(-O3 -Wall -Wextra -Wshadow)
-#add_compile_options(-g -DDEBUG)
+add_compile_options(-Wall -Wextra -Wshadow)
+
+# This is used by the build script
+# to avoid putting metis and gklib in the global path
+include_directories(${METIS_DIR}/include)
+link_directories(${METIS_DIR}/lib)
 
 add_executable(jet partition.cpp)
 add_executable(jet4 partition.cpp)
 add_executable(jet2 partition.cpp)
 add_executable(jet_host partition.cpp)
-add_executable(jet_import partition.cpp)
+add_executable(jet_import import_coarse.cpp)
 add_executable(jet_export partition.cpp)
 add_executable(jet_serial partition.cpp)
 add_executable(pstat part_eval.cpp)
@@ -22,14 +31,17 @@ target_compile_definitions(jet PUBLIC HASHMAP_P)
 target_compile_definitions(jet4 PUBLIC HASHMAP_P FOUR9)
 target_compile_definitions(jet2 PUBLIC HASHMAP_P TWO9)
 target_compile_definitions(jet_host PUBLIC HASHMAP_P HOST)
-target_compile_definitions(jet_import PUBLIC HASHMAP_P HOST IMP)
+target_compile_definitions(jet_import PUBLIC HASHMAP_P HOST)
 target_compile_definitions(jet_export PUBLIC HASHMAP_P HOST EXP)
 target_compile_definitions(jet_serial PUBLIC HASHMAP_P SERIAL)
-target_link_libraries(jet Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(jet4 Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(jet2 Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(jet_host Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(jet_import Kokkos::kokkos Kokkos::kokkoskernels)
-target_link_libraries(jet_export Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(jet_serial Kokkos::kokkos Kokkos::kokkoskernels metis)
-target_link_libraries(pstat Kokkos::kokkos Kokkos::kokkoskernels)
+foreach(prog jet jet4 jet2 jet_host jet_import jet_export jet_serial pstat)
+    target_link_libraries(${prog} Kokkos::kokkos Kokkos::kokkoskernels)
+endforeach(prog)
+foreach(prog jet jet4 jet2 jet_host jet_export jet_serial)
+    target_link_libraries(${prog} metis)
+endforeach(prog)
+if(LINK_GKLIB)
+foreach(prog jet jet4 jet2 jet_host jet_export jet_serial)
+    target_link_libraries(${prog} GKlib)
+endforeach(prog)
+endif()
@@ -8,10 +8,15 @@ For details about the algorithm, please see https://arxiv.org/abs/2304.13194
 
 Kokkos (https://github.com/kokkos/kokkos): Enables performance portable parallelism.  
 KokkosKernels (https://github.com/kokkos/kokkos-kernels): Necessary only for KokkosSparse::CrsMatrix class.  
-Metis (https://github.com/KarypisLab/METIS): Used for initial partitioning of coarsest graph.
+Metis (https://github.com/KarypisLab/METIS): Used for initial partitioning of coarsest graph.  
+(Circumstantial) GKLib (https://github.com/KarypisLab/GKlib.git): Needed to link against the github distribution of Metis. Not needed for older distributions of Metis.
 
 ## Usage
 
+### Building
+
+Standard CMake build process. If your Metis build requires GKlib, add `-DLINK_GKLIB=True` to your cmake command when building Jet. Example build scripts are provided for macOS with OpenMP and Linux systems with Cuda. These scripts handle all required dependencies.
+
 ### Executables
 
 #### Partitioners
 
@@ -48,15 +48,6 @@
 
 namespace jet_partitioner {
 
-template<typename ordinal_t>
-KOKKOS_INLINE_FUNCTION ordinal_t xorshiftHash(ordinal_t key) {
-  ordinal_t x = key;
-  x ^= x << 13;
-  x ^= x >> 17;
-  x ^= x << 5;
-  return x;
-}
-
 template<class crsMat> //typename ordinal_t, typename edge_offset_t, typename scalar_t, class Device>
 class contracter {
 public:
@@ -99,7 +90,7 @@ class contracter {
         wgt_view_t vtx_w;
         coarse_map interp_mtx;
         int level;
-        bool uniform_weights;
+        bool uniform_weights = false;
     };
 
     // define behavior-controlling enums
@@ -112,6 +103,16 @@ class contracter {
     ordinal_t coarse_vtx_cutoff = 1000;
     ordinal_t min_allowed_vtx = 250;
     unsigned int max_levels = 200;
+    const ordinal_t large_row_threshold = 1000;
+    
+bool has_large_row(const matrix_t g){
+    ordinal_t max_row = 0;
+    Kokkos::parallel_reduce("find max row", policy_t(0, g.numRows()), KOKKOS_LAMBDA(const ordinal_t i, ordinal_t& update){
+        ordinal_t degree = g.graph.row_map(i+1) - g.graph.row_map(i);
+        if(degree > update) update = degree;
+    }, Kokkos::Max<ordinal_t, Kokkos::HostSpace>(max_row));
+    return (max_row >= large_row_threshold);
+}
 
 bool should_use_dyn(const ordinal_t n, const Kokkos::View<const edge_offset_t*, Device> work, int t_count){
     bool use_dyn = false;
@@ -194,15 +195,15 @@ struct combineAndDedupe {
         edge_offset_t insert(const edge_offset_t& hash_start, const edge_offset_t& size, const ordinal_t& u) const {
             edge_offset_t offset = abs(xorshiftHash<ordinal_t>(u)) % size;
             while(true){
-                if(htable(hash_start + offset) == -1){
-                    Kokkos::atomic_compare_exchange(&htable(hash_start + offset), -1, u);
+                ordinal_t v = htable(hash_start + offset);
+                if(v == -1){
+                    v = Kokkos::atomic_compare_exchange(&htable(hash_start + offset), -1, u);
                 }
-                if(htable(hash_start + offset) == u){
+                if(v == u || v == -1){
                     return offset;
-                } else {
-                    offset++;
-                    if(offset >= size) offset -= size;
                 }
+                offset++;
+                if(offset >= size) offset -= size;
             }
         }
 
@@ -372,11 +373,13 @@ coarse_level_triple build_coarse_graph(const coarse_level_triple level,
     vtx_view_t htable(Kokkos::ViewAllocateWithoutInitializing("hashtable keys"), hash_size);
     Kokkos::deep_copy(htable, -1);
     wgt_view_t hvals("hashtable values", hash_size);
+    // use thread teams on gpu when graph has decent average degree or very large max degree
+    bool use_team = (!is_host_space && (hash_size / n >= 12 || has_large_row(g)));
     //insert each coarse vertex into a bucket determined by a hash
     //use linear probing to resolve conflicts
     //combine weights using atomic addition
     combineAndDedupe cnd(g, vcmap.map, htable, hvals, hrow_map);
-    if(!is_host_space && hash_size / n >= 12) {
+    if(use_team) {
         Kokkos::parallel_for("deduplicate", team_policy_t(n, Kokkos::AUTO), cnd);
     } else {
         bool use_dyn = should_use_dyn(n, g.graph.row_map, exec_space().concurrency());
@@ -391,7 +394,7 @@ coarse_level_triple build_coarse_graph(const coarse_level_triple level,
     timer.reset();
     edge_view_t coarse_row_map_f("edges_per_source", nc + 1);
     countUnique cu(htable, hrow_map, coarse_row_map_f);
-    if(!is_host_space && hash_size / nc >= 12) {
+    if(use_team) {
         Kokkos::parallel_for("count unique", team_policy_t(nc, Kokkos::AUTO), cu);
     } else {
         Kokkos::parallel_for("count unique", policy_t(0, nc), cu);
@@ -412,7 +415,7 @@ coarse_level_triple build_coarse_graph(const coarse_level_triple level,
     vtx_view_t entries_coarse(Kokkos::ViewAllocateWithoutInitializing("coarse entries"), hash_size);
     wgt_view_t wgts_coarse(Kokkos::ViewAllocateWithoutInitializing("coarse weights"), hash_size);
     consolidateUnique consolidate(htable, entries_coarse, hvals, wgts_coarse, hrow_map, coarse_row_map_f);
-    if(!is_host_space && hash_size / nc >= 12) {
+    if(use_team) {
         Kokkos::parallel_for("consolidate", team_policy_t(nc, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(4*sizeof(ordinal_t))), consolidate);
     } else {
         bool use_dyn = should_use_dyn(nc, hrow_map, exec_space().concurrency());
@@ -474,118 +477,14 @@ coarse_map generate_coarse_mapping(const matrix_t g,
             break;
         case Match:
         case MtMetis:
-            interpolation_graph = mapper.coarsen_match(g, uniform_weights, rand_pool, experiment, choice);
+            interpolation_graph = mapper.coarsen_match(g, uniform_weights, rand_pool, choice);
             break;
     }
     Kokkos::fence();
     experiment.addMeasurement(Measurement::Map, timer.seconds());
     return interpolation_graph;
 }
 
-void dump_stats(const matrix_t g, const wgt_view_t vtx_w_dev){
-    typename edge_view_t::HostMirror row_map("row map mirror", g.numRows() + 1);
-    typename wgt_view_t::HostMirror values("entries mirror", g.nnz());
-    typename wgt_view_t::HostMirror vtx_w("vtx_w mirror", g.numRows());
-    Kokkos::deep_copy(row_map, g.graph.row_map);
-    Kokkos::deep_copy(values, g.values);
-    Kokkos::deep_copy(vtx_w, vtx_w_dev);
-    std::ofstream degree_out("dump/degree.txt");
-    std::ofstream vtx_w_out("dump/vtx_w.txt");
-    std::ofstream wgt_degree_out("dump/wgt_degree.txt");
-    for(ordinal_t i = 0; i < g.numRows(); i++){
-        degree_out << row_map(i+1) - row_map(i) << std::endl;
-        vtx_w_out << vtx_w(i) << std::endl;
-        edge_offset_t wgt_degree = 0;
-        for(edge_offset_t j = row_map(i); j < row_map(i+1); j++){
-            wgt_degree += values(j);
-        }
-        wgt_degree_out << wgt_degree << std::endl;
-    }
-    degree_out.close();
-    wgt_degree_out.close();
-    vtx_w_out.close();
-}
-
-std::list<coarse_level_triple> load_coarse(){
-    FILE* cgfp = fopen("/home/mike/workspace/mt-KaHIP/coarse_graphs.out", "r");
-    int size = 0;
-    fread(&size, sizeof(int), 1, cgfp);
-    printf("Number of graphs: %i\n", size);
-    std::list<coarse_level_triple> levels;
-    ordinal_t prev_n = 0;
-    for(int i = 0; i < size; i++){
-        coarse_level_triple level;
-        level.level = i + 1;
-        ordinal_t N = 0;
-        fread(&N, sizeof(ordinal_t), 1, cgfp);
-        edge_offset_t M = 0;
-        fread(&M, sizeof(edge_offset_t), 1, cgfp);
-        edge_view_t rows("rows", N + 1);
-        auto rows_m = Kokkos::create_mirror_view(rows);
-        fread(rows_m.data(), sizeof(edge_offset_t), N + 1, cgfp);
-        Kokkos::deep_copy(rows, rows_m);
-        vtx_view_t entries("entries", M);
-        auto entries_m = Kokkos::create_mirror_view(entries);
-        fread(entries_m.data(), sizeof(ordinal_t), M, cgfp);
-        Kokkos::deep_copy(entries, entries_m);
-        wgt_view_t values("values", M);
-        auto values_m = Kokkos::create_mirror_view(values);
-        fread(values_m.data(), sizeof(scalar_t), M, cgfp);
-        Kokkos::deep_copy(values, values_m);
-        graph_type graph(entries, rows);
-        matrix_t g("g", N, values, graph);
-        level.mtx = g;
-        wgt_view_t vtx_wgts("vtx wgts", N);
-        auto vtx_wgts_m = Kokkos::create_mirror_view(vtx_wgts);
-        fread(vtx_wgts_m.data(), sizeof(scalar_t), N, cgfp);
-        Kokkos::deep_copy(vtx_wgts, vtx_wgts_m);
-        level.vtx_w = vtx_wgts;
-        if(level.level > 1){
-            vtx_view_t i_entries("entries", prev_n);
-            auto i_entries_m = Kokkos::create_mirror_view(i_entries);
-            fread(i_entries_m.data(), sizeof(ordinal_t), prev_n, cgfp);
-            Kokkos::deep_copy(i_entries, i_entries_m);
-            coarse_map i_g;
-            i_g.coarse_vtx = N;
-            i_g.map = i_entries;
-            level.interp_mtx = i_g;
-        }
-        prev_n = N;
-        levels.push_back(level);
-    }
-    fclose(cgfp);
-    return levels;
-}
-
-void dump_coarse(std::list<coarse_level_triple> levels){
-    FILE* cgfp = fopen("/home/mike/workspace/mt-KaHIP/coarse_graphs.out", "w");
-    int size = levels.size();
-    fwrite(&size, sizeof(int), 1, cgfp);
-    ordinal_t prev_n = 0;
-    for(auto level : levels){
-        matrix_t g = level.mtx;
-        ordinal_t N = g.numRows();
-        edge_offset_t M = g.nnz();
-        auto rows = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), g.graph.row_map);
-        auto entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), g.graph.entries);
-        auto values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), g.values);
-        auto vtx_wgts = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), level.vtx_w);
-        fwrite(&N, sizeof(ordinal_t), 1, cgfp);
-        fwrite(&M, sizeof(edge_offset_t), 1, cgfp);
-        fwrite(rows.data(), sizeof(edge_offset_t), N+1, cgfp);
-        fwrite(entries.data(), sizeof(ordinal_t), M, cgfp);
-        fwrite(values.data(), sizeof(scalar_t), M, cgfp);
-        fwrite(vtx_wgts.data(), sizeof(scalar_t), N, cgfp);
-        if(level.level > 1){
-            coarse_map interp_mtx = level.interp_mtx;
-            auto i_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interp_mtx.map);
-            fwrite(i_entries.data(), sizeof(ordinal_t), prev_n, cgfp);
-        }
-        prev_n = N;
-    }
-    fclose(cgfp);
-}
-
 std::list<coarse_level_triple> generate_coarse_graphs(const matrix_t fine_g, const wgt_view_t vweights, ExperimentLoggerUtil<scalar_t>& experiment, bool uniform_eweights = false) {
 
     Kokkos::Timer timer;