From 2d8f5facdbec0c5c3e8646c24ac87c80b8978bcb Mon Sep 17 00:00:00 2001
From: Jochen Topf <jochen@topf.org>
Date: Wed, 2 Sep 2020 19:56:17 +0200
Subject: [PATCH] Way node index using shifted node ids

OSM ways have a locality property that we can use to reduce the size of
the index looking up ways a node is in: they are often made up of
sequential node ids. If node N is contained in the way, then there is a
good chance that N+1, N+2, ... are contained in it as well. Thus, if we
group nearby nodes and create an index from node groups to ways, the
index will be significantly smaller. The drawback is that a lookup in
such an index returns false positives, i.e. ways that do not contain the
node of interest. So the smaller index is paid for with a performance
loss for updates.

"Grouping" the ids happens by shifting the id a few bits to the right.
How many exactly can be configured with the
--middle-way-node-index-id-shift option.

This commit sets the default shift for the node ids to 0, i.e. no shift,
so it is completely backwards compatible. Users can set a different
shift with the command line option. See docs/bucket-index.md for
details.

Setting the shift to something like 4 or 5 can significantly reduce the
disk space needed (saves something like 200 GB on a full planet), but
it costs some performance on updates (they are about 30% slower).

This is an improved version of
https://github.com/openstreetmap/osm2pgsql/pull/1058
---
 docs/bucket-index.md | 88 ++++++++++++++++++++++++++++++++++++++++++++
 docs/osm2pgsql.md    |  4 ++
 src/middle-pgsql.cpp | 60 +++++++++++++++++++++++++-----
 src/options.cpp      |  8 ++++
 src/options.hpp      |  8 ++++
 5 files changed, 159 insertions(+), 9 deletions(-)
 create mode 100644 docs/bucket-index.md

diff --git a/docs/bucket-index.md b/docs/bucket-index.md
new file mode 100644
index 000000000..8f853216b
--- /dev/null
+++ b/docs/bucket-index.md
@@ -0,0 +1,88 @@
+
+NOTE: This is only available from osm2pgsql version 1.4.0!
+
+NOTE: The default is still to create the old index for now.
+
+# Bucket index for slim mode
+
+Osm2pgsql can use an index for way node lookups in slim mode that needs a lot
+less disk space than earlier versions did. For a planet the savings can be
+about 200 GB! Lookup times are slightly slower, but this shouldn't be an issue
+for most people.
+
+*If you are not using slim mode and/or not doing updates of your database, this
+does not apply to you.*
+
+For backwards compatibility osm2pgsql will never update an existing database
+to the new index. It will keep using the old index. So you do not have to do
+anything when upgrading osm2pgsql.
+
+If you want to use the new index, there are two ways of doing this: The "safe"
+way for most users and the "do-it-yourself" way for expert users. Note that
+once you have switched to the new index, older versions of osm2pgsql will not
+work correctly any more.
+
+## Update for most users
+
+NOTE: This does not work yet. Currently the default is still to create the
+old type of index.
+
+If your database was created with an older version of osm2pgsql you might want
+to start again from an empty database. Just do a reimport and osm2pgsql will
+use the new space-saving index.
+
+## Update for expert users
+
+This is only for users who are very familiar with osm2pgsql and PostgreSQL
+operation. You can break your osm2pgsql database beyond repair if something
+goes wrong here and you might not even notice.
+
+You can create the index yourself by following these steps:
+
+Drop the existing index. Replace `{prefix}` by the prefix you are using.
+Usually this is `planet_osm`:
+
+```
+DROP INDEX {prefix}_ways_nodes_idx;
+```
+
+Create the `index_bucket` function needed for the index. Replace
+`{way_node_index_id_shift}` by the number of bits you want the id to be
+shifted. If you don't have a reason to use something else, use `5`:
+
+```
+CREATE FUNCTION {prefix}_index_bucket(int8[]) RETURNS int8[] AS $$
+  SELECT ARRAY(SELECT DISTINCT unnest($1) >> {way_node_index_id_shift})
+$$ LANGUAGE SQL IMMUTABLE;
+```
+
+Now you can create the new index. Again, replace `{prefix}` by the prefix
+you are using:
+
+```
+CREATE INDEX {prefix}_ways_nodes_bucket_idx ON {prefix}_ways
+  USING GIN ({prefix}_index_bucket(nodes))
+  WITH (fastupdate = off);
+```
+
+If you want to create the index in a specific tablespace you can do this:
+
+```
+CREATE INDEX {prefix}_ways_nodes_bucket_idx ON {prefix}_ways
+  USING GIN ({prefix}_index_bucket(nodes))
+  WITH (fastupdate = off) TABLESPACE {tablespace};
+```
+
+## Id shift (for experts)
+
+When creating a new database (when used in create mode with slim option),
+osm2pgsql can create a bucket index using a configurable id shift.
+
+You can set the shift with the command line option
+`--middle-way-node-index-id-shift`. Values between about 3 and 6 might make
+sense.
+
+To completely disable the bucket index and create an index compatible with
+earlier versions of osm2pgsql, use `--middle-way-node-index-id-shift=0`.
+(This is currently still the default.)
+
diff --git a/docs/osm2pgsql.md b/docs/osm2pgsql.md
index 8ff2739f4..8e47e9778 100644
--- a/docs/osm2pgsql.md
+++ b/docs/osm2pgsql.md
@@ -220,6 +220,10 @@ starting with two dashes (`--`). A summary of options is included below.
 -v, \--verbose
 :   Verbose output.
 
+\--middle-way-node-index-id-shift shift
+:   Set ID shift for way node bucket index in middle. Experts only. See
+    documentation for details.
+
 
 # SUPPORTED PROJECTIONS
 
diff --git a/src/middle-pgsql.cpp b/src/middle-pgsql.cpp
index 5a74363c4..d1f92ec87 100644
--- a/src/middle-pgsql.cpp
+++ b/src/middle-pgsql.cpp
@@ -55,8 +55,8 @@ static std::string build_sql(options_t const &options, char const *templ)
         fmt::arg("unlogged", options.droptemp ? "UNLOGGED" : ""),
         fmt::arg("using_tablespace", using_tablespace),
         fmt::arg("data_tablespace", tablespace_clause(options.tblsslim_data)),
-        fmt::arg("index_tablespace",
-                 tablespace_clause(options.tblsslim_index)));
+        fmt::arg("index_tablespace", tablespace_clause(options.tblsslim_index)),
+        fmt::arg("way_node_index_id_shift", options.way_node_index_id_shift));
 }
 
 middle_pgsql_t::table_desc::table_desc(options_t const &options,
@@ -634,7 +634,8 @@ static table_sql sql_for_nodes() noexcept
     return sql;
 }
 
-static table_sql sql_for_ways() noexcept
+static table_sql sql_for_ways(bool has_bucket_index,
+                              uint8_t way_node_index_id_shift) noexcept
 {
     table_sql sql{};
 
@@ -653,12 +654,33 @@ static table_sql sql_for_ways() noexcept
                         "  SELECT id, nodes, tags"
                         "    FROM {prefix}_ways WHERE id = ANY($1::int8[]);\n";
 
-    sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
-                       "  SELECT id FROM {prefix}_ways"
-                       "    WHERE nodes && ARRAY[$1];\n";
+    if (has_bucket_index) {
+        sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
+                           "  SELECT id FROM {prefix}_ways w"
+                           "    WHERE $1 = ANY(nodes)"
+                           "      AND {prefix}_index_bucket(w.nodes)"
+                           "       && {prefix}_index_bucket(ARRAY[$1]);\n";
+    } else {
+        sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
+                           "  SELECT id FROM {prefix}_ways"
+                           "    WHERE nodes && ARRAY[$1];\n";
+    }
 
-    sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)"
-                       "  WITH (fastupdate = off) {index_tablespace};\n";
+    if (way_node_index_id_shift == 0) {
+        sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)"
+                           "  WITH (fastupdate = off) {index_tablespace};\n";
+    } else {
+        sql.create_index = "CREATE OR REPLACE FUNCTION"
+                           "    {prefix}_index_bucket(int8[])"
+                           "  RETURNS int8[] AS $$\n"
+                           "  SELECT ARRAY(SELECT DISTINCT"
+                           "    unnest($1) >> {way_node_index_id_shift})\n"
+                           "$$ LANGUAGE SQL IMMUTABLE;\n"
+                           "CREATE INDEX {prefix}_ways_nodes_bucket_idx"
+                           "  ON {prefix}_ways"
+                           "  USING GIN ({prefix}_index_bucket(nodes))"
+                           "  WITH (fastupdate = off) {index_tablespace};\n";
+    }
 
     return sql;
 }
@@ -697,6 +719,16 @@ static table_sql sql_for_relations() noexcept
     return sql;
 }
 
+static bool check_bucket_index(pg_conn_t *db_connection,
+                               std::string const &prefix)
+{
+    auto const res = db_connection->query(
+        PGRES_TUPLES_OK,
+        "SELECT relname FROM pg_class WHERE relkind='i' AND"
+        "  relname = '{}_ways_nodes_bucket_idx';"_format(prefix));
+    return res.num_tuples() > 0;
+}
+
 middle_pgsql_t::middle_pgsql_t(options_t const *options)
 : m_append(options->append), m_out_options(options),
   m_cache(new node_ram_cache{options->alloc_chunkwise | ALLOC_LOSSY,
@@ -712,8 +744,18 @@ middle_pgsql_t::middle_pgsql_t(options_t const *options)
 
     fmt::print(stderr, "Mid: pgsql, cache={}\n", options->cache);
 
+    bool const has_bucket_index =
+        check_bucket_index(&m_db_connection, options->prefix);
+
+    if (!has_bucket_index && options->append) {
+        fmt::print(stderr, "You don't have a bucket index. See"
+                           " docs/bucket-index.md for details.\n");
+    }
+
     m_tables[NODE_TABLE] = table_desc{*options, sql_for_nodes()};
-    m_tables[WAY_TABLE] = table_desc{*options, sql_for_ways()};
+    m_tables[WAY_TABLE] =
+        table_desc{*options, sql_for_ways(has_bucket_index,
+                                          options->way_node_index_id_shift)};
     m_tables[REL_TABLE] = table_desc{*options, sql_for_relations()};
 }
 
diff --git a/src/options.cpp b/src/options.cpp
index 65f76111c..e682de429 100644
--- a/src/options.cpp
+++ b/src/options.cpp
@@ -58,6 +58,7 @@ const struct option long_options[] = {
     {"keep-coastlines", no_argument, nullptr, 'K'},
     {"latlong", no_argument, nullptr, 'l'},
     {"merc", no_argument, nullptr, 'm'},
+    {"middle-way-node-index-id-shift", required_argument, nullptr, 300},
     {"multi-geometry", no_argument, nullptr, 'G'},
     {"number-processes", required_argument, nullptr, 205},
     {"output", required_argument, nullptr, 'O'},
@@ -182,6 +183,10 @@ void long_usage(char const *arg0, bool verbose)
 #endif
         printf("%s", "\
     \n\
+    Middle options (experts only):\n\
+          --middle-way-node-index-id-shift shift  Set ID shift for bucket\n\
+                             index. See documentation for details.\n\
+    \n\
     Expiry options:\n\
        -e|--expire-tiles [min_zoom-]max_zoom    Create a tile expiry list.\n\
                              Zoom levels must be larger than 0 and smaller\n\
@@ -567,6 +572,9 @@ options_t::options_t(int argc, char *argv[]) : options_t()
             fprintf(stderr, "\n");
             exit(EXIT_SUCCESS);
             break;
+        case 300:
+            way_node_index_id_shift = atoi(optarg);
+            break;
         case '?':
         default:
             short_usage(argv[0]);
diff --git a/src/options.hpp b/src/options.hpp
index 13cb7fe6d..070cb5a95 100644
--- a/src/options.hpp
+++ b/src/options.hpp
@@ -130,6 +130,14 @@ class options_t
 
     std::vector<std::string> input_files;
 
+    /**
+     * How many bits should the node id be shifted for the way node index?
+     * Use 0 to disable for backwards compatibility.
+     * Currently the default is 0, making osm2pgsql backwards compatible with
+     * earlier versions.
+     */
+    uint8_t way_node_index_id_shift = 0;
+
 private:
     /**
      * Check input options for sanity