osm2pgsql-dev · lonvia · Oct 8, 2020 · Sep 2, 2020
diff --git a/docs/bucket-index.md b/docs/bucket-index.md
@@ -0,0 +1,88 @@
+
+NOTE: This is only available in osm2pgsql version 1.4.0 and later!
+
+NOTE: The default is still to create the old index for now.
+
+# Bucket index for slim mode
+
+Osm2pgsql can use an index for way node lookups in slim mode that needs a lot
+less disk space than earlier versions did. For a planet the savings can be
+about 200 GB! Lookup times are slightly slower, but this shouldn't be an issue
+for most people.
+
+*If you are not using slim mode and/or not doing updates of your database, this
+does not apply to you.*
+
+For backwards compatibility osm2pgsql will never update an existing database
+to the new index. It will keep using the old index. So you do not have to do
+anything when upgrading osm2pgsql.
+
+If you want to use the new index, there are two ways of doing this: The "safe"
+way for most users and the "do-it-yourself" way for expert users. Note that
+once you have switched to the new index, older versions of osm2pgsql will not
+work correctly any more.
+
+## Update for most users
+
+NOTE: This does not work yet. Currently the default is still to create the
+old type of index.
+
+If your database was created with an older version of osm2pgsql you might want
+to start again from an empty database. Just do a reimport and osm2pgsql will
+use the new space-saving index.
+
+## Update for expert users
+
+This is only for users who are very familiar with osm2pgsql and PostgreSQL
+operation. You can break your osm2pgsql database beyond repair if something
+goes wrong here and you might not even notice.
+
+You can create the index yourself by following these steps:
+
+Drop the existing index. Replace `{prefix}` by the prefix you are using.
+Usually this is `planet_osm`:
+
+```
+DROP INDEX {prefix}_ways_nodes_idx;
+```
+
+Create the `index_bucket` function needed for the index. Replace
+`{way_node_index_id_shift}` by the number of bits you want the id to be
+shifted. If you don't have a reason to use something else, use `5`:
+
+```
+CREATE FUNCTION {prefix}_index_bucket(int8[]) RETURNS int8[] AS $$
+  SELECT ARRAY(SELECT DISTINCT unnest($1) >> {way_node_index_id_shift})
+$$ LANGUAGE SQL IMMUTABLE;
+```
+
+Now you can create the new index. Again, replace `{prefix}` by the prefix
+you are using:
+
+```
+CREATE INDEX {prefix}_ways_nodes_bucket_idx ON {prefix}_ways
+  USING GIN ({prefix}_index_bucket(nodes))
+  WITH (fastupdate = off);
+```
+
+If you want to create the index in a specific tablespace you can do this:
+
+```
+CREATE INDEX {prefix}_ways_nodes_bucket_idx ON {prefix}_ways
+  USING GIN ({prefix}_index_bucket(nodes))
+  WITH (fastupdate = off) TABLESPACE {tablespace};
+```
+
+## Id shift (for experts)
+
+When creating a new database (that is, in create mode with the slim option),
+osm2pgsql can create a bucket index using a configurable id shift.
+
+You can set the shift with the command line option
+`--middle-way-node-index-id-shift`. Values between about 3 and 6 might make
+sense.
+
+To completely disable the bucket index and create an index compatible with
+earlier versions of osm2pgsql, use `--middle-way-node-index-id-shift=0`.
+(This is currently still the default.)
+
diff --git a/docs/osm2pgsql.md b/docs/osm2pgsql.md
@@ -220,6 +220,10 @@ starting with two dashes (`--`). A summary of options is included below.
 -v, \--verbose
 :   Verbose output.
 
+\--middle-way-node-index-id-shift shift
+:   Set ID shift for way node bucket index in middle. Experts only. See
+    documentation for details.
+
 
 # SUPPORTED PROJECTIONS
 

diff --git a/src/middle-pgsql.cpp b/src/middle-pgsql.cpp
@@ -55,8 +55,8 @@ static std::string build_sql(options_t const &options, char const *templ)
         fmt::arg("unlogged", options.droptemp ? "UNLOGGED" : ""),
         fmt::arg("using_tablespace", using_tablespace),
         fmt::arg("data_tablespace", tablespace_clause(options.tblsslim_data)),
-        fmt::arg("index_tablespace",
-                 tablespace_clause(options.tblsslim_index)));
+        fmt::arg("index_tablespace", tablespace_clause(options.tblsslim_index)),
+        fmt::arg("way_node_index_id_shift", options.way_node_index_id_shift));
 }
 
 middle_pgsql_t::table_desc::table_desc(options_t const &options,
@@ -634,7 +634,8 @@ static table_sql sql_for_nodes() noexcept
     return sql;
 }
 
-static table_sql sql_for_ways() noexcept
+static table_sql sql_for_ways(bool has_bucket_index,
+                              uint8_t way_node_index_id_shift) noexcept
 {
     table_sql sql{};
 
@@ -653,12 +654,33 @@ static table_sql sql_for_ways() noexcept
                         "  SELECT id, nodes, tags"
                         "    FROM {prefix}_ways WHERE id = ANY($1::int8[]);\n";
 
-    sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
-                       "  SELECT id FROM {prefix}_ways"
-                       "    WHERE nodes && ARRAY[$1];\n";
+    if (has_bucket_index) {
+        sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
+                           "  SELECT id FROM {prefix}_ways w"
+                           "    WHERE $1 = ANY(nodes)"
+                           "      AND {prefix}_index_bucket(w.nodes)"
+                           "       && {prefix}_index_bucket(ARRAY[$1]);\n";
+    } else {
+        sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS"
+                           "  SELECT id FROM {prefix}_ways"
+                           "    WHERE nodes && ARRAY[$1];\n";
+    }
 
-    sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)"
-                       "  WITH (fastupdate = off) {index_tablespace};\n";
+    if (way_node_index_id_shift == 0) {
+        sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)"
+                           "  WITH (fastupdate = off) {index_tablespace};\n";
+    } else {
+        sql.create_index = "CREATE OR REPLACE FUNCTION"
+                           "    {prefix}_index_bucket(int8[])"
+                           "  RETURNS int8[] AS $$\n"
+                           "  SELECT ARRAY(SELECT DISTINCT"
+                           "    unnest($1) >> {way_node_index_id_shift})\n"
+                           "$$ LANGUAGE SQL IMMUTABLE;\n"
+                           "CREATE INDEX {prefix}_ways_nodes_bucket_idx"
+                           "  ON {prefix}_ways"
+                           "  USING GIN ({prefix}_index_bucket(nodes))"
+                           "  WITH (fastupdate = off) {index_tablespace};\n";
+    }
 
     return sql;
 }
@@ -697,6 +719,16 @@ static table_sql sql_for_relations() noexcept
     return sql;
 }
 
+static bool check_bucket_index(pg_conn_t *db_connection,
+                               std::string const &prefix)
+{ // Is there an index (relkind 'i') named {prefix}_ways_nodes_bucket_idx?
+    auto const res = db_connection->query(
+        PGRES_TUPLES_OK,
+        "SELECT relname FROM pg_class WHERE relkind='i' AND"
+        "  relname = '{}_ways_nodes_bucket_idx';"_format(prefix));
+    return res.num_tuples() > 0; // any matching row means the index exists
+}
+
 middle_pgsql_t::middle_pgsql_t(options_t const *options)
 : m_append(options->append), m_out_options(options),
   m_cache(new node_ram_cache{options->alloc_chunkwise | ALLOC_LOSSY,
@@ -712,8 +744,18 @@ middle_pgsql_t::middle_pgsql_t(options_t const *options)
 
     fmt::print(stderr, "Mid: pgsql, cache={}\n", options->cache);
 
+    bool const has_bucket_index =
+        check_bucket_index(&m_db_connection, options->prefix);
+
+    if (!has_bucket_index && options->append) {
+        fmt::print(stderr, "You don't have a bucket index. See"
+                           " docs/bucket-index.md for details.\n");
+    }
+
     m_tables[NODE_TABLE] = table_desc{*options, sql_for_nodes()};
-    m_tables[WAY_TABLE] = table_desc{*options, sql_for_ways()};
+    m_tables[WAY_TABLE] =
+        table_desc{*options, sql_for_ways(has_bucket_index,
+                                          options->way_node_index_id_shift)};
     m_tables[REL_TABLE] = table_desc{*options, sql_for_relations()};
 }
 

diff --git a/src/options.cpp b/src/options.cpp
@@ -58,6 +58,7 @@ const struct option long_options[] = {
     {"keep-coastlines", no_argument, nullptr, 'K'},
     {"latlong", no_argument, nullptr, 'l'},
     {"merc", no_argument, nullptr, 'm'},
+    {"middle-way-node-index-id-shift", required_argument, nullptr, 300},
     {"multi-geometry", no_argument, nullptr, 'G'},
     {"number-processes", required_argument, nullptr, 205},
     {"output", required_argument, nullptr, 'O'},
@@ -182,6 +183,10 @@ void long_usage(char const *arg0, bool verbose)
 #endif
         printf("%s", "\
     \n\
+    Middle options (experts only):\n\
+          --middle-way-node-index-id-shift shift  Set ID shift for bucket\
+                             index. See documentation for details.\
+    \n\
     Expiry options:\n\
        -e|--expire-tiles [min_zoom-]max_zoom    Create a tile expiry list.\n\
                              Zoom levels must be larger than 0 and smaller\n\
@@ -567,6 +572,9 @@ options_t::options_t(int argc, char *argv[]) : options_t()
             fprintf(stderr, "\n");
             exit(EXIT_SUCCESS);
             break;
+        case 300:
+            way_node_index_id_shift = atoi(optarg);
+            break;
         case '?':
         default:
             short_usage(argv[0]);

diff --git a/src/options.hpp b/src/options.hpp
@@ -130,6 +130,14 @@ class options_t
 
     std::vector<std::string> input_files;
 
+    /**
+     * How many bits should the node id be shifted for the way node index?
+     * Use 0 to disable for backwards compatibility.
+     * Currently the default is 0, making osm2pgsql backwards compatible with
+     * earlier versions.
+     */
+    uint8_t way_node_index_id_shift = 0;
+
 private:
     /**
      * Check input options for sanity