diff --git a/docs/bucket-index.md b/docs/bucket-index.md new file mode 100644 index 000000000..44ba86566 --- /dev/null +++ b/docs/bucket-index.md @@ -0,0 +1,79 @@ + +# Bucket index for slim mode + +Since version XXX osm2pgsql can use an index for way node lookups in slim mode +that needs a lot less disk space than before. For a planet the savings can be +about 200 GB! Lookup times are slightly slower, but this shouldn't be an issue +for most people. + +If you are not using slim mode and/or not doing updates of your database, this +does not apply to you. + +For backwards compatibility osm2pgsql will **not** update an existing database +to the new index. It will keep using the old index. So you do not have to do +anything when upgrading osm2pgsql. + +If you want to use the new index, there are two ways of doing this: The "safe" +way for most users and the "doit-it-yourself" way for expert users. Note that +once you switched to the new index, older versions of osm2pgsql will not work +correctly any more. + +## Update for most users + +If your database was created with an older version of osm2pgsql you might want +to start again from an empty database. Just do a reimport and osm2pgsql will +use the new space-saving index. + +## Update for expert users + +This is only for users who are very familiar with osm2pgsql and PostgreSQL +operation. You can break your osm2pgsql database beyond repair if something +goes wrong here and you might not even notice. + +You can create the index yourself by following these steps: + +Drop the existing index. Replace `{prefix}` by the prefix you are using. +Usually this is `planet_osm`: + +``` +DROP INDEX {prefix}_ways_nodes_idx; +``` + +Create the `index_bucket` function needed for the index. Replace +`{index_bucket_size}` by the bucket size you want. If you don't have a reason +to use something else, use `32`: + +``` +CREATE FUNCTION {prefix}_index_bucket(int8[]) RETURNS int8[] AS $$ + SELECT ARRAY(SELECT DISTINCT unnest($1)/{index_bucket_size}) +$$ LANGUAGE SQL IMMUTABLE; +``` + +Now you can create the new index. Again, replace `{prefix}` by the prefix +you are using: + +``` +CREATE INDEX {prefix}_ways_index_bucket_idx ON {prefix}_ways + USING GIN ({prefix}_index_bucket(nodes)) + WITH (fastupdate = off); +``` + +If you want to create the index in a specific tablespace you can do this: + +``` +CREATE INDEX {prefix}_ways_index_bucket_idx ON {prefix}_ways + USING GIN ({prefix}_index_bucket(nodes)) + WITH (fastupdate = off) TABLESPACE {tablespace}; +``` + +## Bucket size (for experts) + +When creating a new database (when used in create mode with slim option), +osm2pgsql will create a bucket index using bucket size 32. + +You can set the environment variable `OSM2PGSQL_INDEX_BUCKET_SIZE` to the +bucket size you want. Values between about 8 and 64 might make sense. + +To completely disable the bucket index and create an index compatible with +earlier versions of osm2pgsql, set `OSM2PGSQL_INDEX_BUCKET_SIZE` to `0`. + diff --git a/src/middle-pgsql.cpp b/src/middle-pgsql.cpp index 5a74363c4..7762a1760 100644 --- a/src/middle-pgsql.cpp +++ b/src/middle-pgsql.cpp @@ -55,8 +55,8 @@ static std::string build_sql(options_t const &options, char const *templ) fmt::arg("unlogged", options.droptemp ? "UNLOGGED" : ""), fmt::arg("using_tablespace", using_tablespace), fmt::arg("data_tablespace", tablespace_clause(options.tblsslim_data)), - fmt::arg("index_tablespace", - tablespace_clause(options.tblsslim_index))); + fmt::arg("index_tablespace", tablespace_clause(options.tblsslim_index)), + fmt::arg("index_bucket_size", options.index_bucket_size)); } middle_pgsql_t::table_desc::table_desc(options_t const &options, @@ -634,7 +634,8 @@ static table_sql sql_for_nodes() noexcept return sql; } -static table_sql sql_for_ways() noexcept +static table_sql sql_for_ways(bool has_bucket_index, + uint8_t index_bucket_size) noexcept { table_sql sql{}; @@ -653,12 +654,33 @@ static table_sql sql_for_ways() noexcept " SELECT id, nodes, tags" " FROM {prefix}_ways WHERE id = ANY($1::int8[]);\n"; - sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS" - " SELECT id FROM {prefix}_ways" - " WHERE nodes && ARRAY[$1];\n"; + if (has_bucket_index) { + sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS" + " SELECT id FROM {prefix}_ways w" + " WHERE $1 = ANY(nodes)" + " AND {prefix}_index_bucket(w.nodes)" + " && {prefix}_index_bucket(ARRAY[$1]);\n"; + } else { + sql.prepare_mark = "PREPARE mark_ways_by_node(int8) AS" + " SELECT id FROM {prefix}_ways" + " WHERE nodes && ARRAY[$1];\n"; + } - sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)" - " WITH (fastupdate = off) {index_tablespace};\n"; + if (index_bucket_size == 0) { + sql.create_index = "CREATE INDEX ON {prefix}_ways USING GIN (nodes)" + " WITH (fastupdate = off) {index_tablespace};\n"; + } else { + sql.create_index = "CREATE OR REPLACE FUNCTION" + " {prefix}_index_bucket(int8[])" + " RETURNS int8[] AS $$\n" + " SELECT ARRAY(SELECT DISTINCT" + " unnest($1)/{index_bucket_size})\n" + "$$ LANGUAGE SQL IMMUTABLE;\n" + "CREATE INDEX {prefix}_ways_index_bucket_idx" + " ON {prefix}_ways" + " USING GIN ({prefix}_index_bucket(nodes))" + " WITH (fastupdate = off) {index_tablespace};\n"; + } return sql; } @@ -697,6 +719,16 @@ static table_sql sql_for_relations() noexcept return sql; } +static bool check_bucket_index(pg_conn_t *db_connection, + std::string const &prefix) +{ + auto const res = db_connection->query( + PGRES_TUPLES_OK, + "SELECT relname FROM pg_class WHERE relkind='i' AND" + " relname = '{}_ways_index_bucket_idx';"_format(prefix)); + return res.num_tuples() > 0; +} + middle_pgsql_t::middle_pgsql_t(options_t const *options) : m_append(options->append), m_out_options(options), m_cache(new node_ram_cache{options->alloc_chunkwise | ALLOC_LOSSY, @@ -712,8 +744,17 @@ middle_pgsql_t::middle_pgsql_t(options_t const *options) fmt::print(stderr, "Mid: pgsql, cache={}\n", options->cache); + bool const has_bucket_index = + check_bucket_index(&m_db_connection, options->prefix); + + if (!has_bucket_index && options->append) { + fmt::print(stderr, "You don't have a bucket index. See" + " docs/bucket-index.md for details.\n"); + } + m_tables[NODE_TABLE] = table_desc{*options, sql_for_nodes()}; - m_tables[WAY_TABLE] = table_desc{*options, sql_for_ways()}; + m_tables[WAY_TABLE] = table_desc{ + *options, sql_for_ways(has_bucket_index, options->index_bucket_size)}; m_tables[REL_TABLE] = table_desc{*options, sql_for_relations()}; } diff --git a/src/options.cpp b/src/options.cpp index 7bee6520f..5b82bc3cf 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -324,8 +324,45 @@ static osmium::Box parse_bbox(char const *bbox) return osmium::Box{minx, miny, maxx, maxy}; } +template +T get_env_unsigned(char const *var, T default_value) +{ + char const *const str = std::getenv(var); + + if (!str) { + return default_value; + } + + char *end = nullptr; + auto const val = std::strtoull(str, &end, 10); + + if (*end != '\0') { + fmt::print("Warning! Could not parse value of env variable {}.\n" + " Using default value of {}.\n\n", + var, default_value); + return default_value; + } + + if (val > std::numeric_limits::max()) { + fmt::print("Warning! Value of env variable {} out of range.\n" + " Using default value of {}.\n\n", + var, default_value); + return default_value; + } + + return static_cast(val); +} + +void options_t::get_options_from_env() +{ + index_bucket_size = + get_env_unsigned("OSM2PGSQL_INDEX_BUCKET_SIZE", 32); +} + options_t::options_t(int argc, char *argv[]) : options_t() { + get_options_from_env(); + bool help_verbose = false; // Will be set when -v/--verbose is set int c; diff --git a/src/options.hpp b/src/options.hpp index 13cb7fe6d..cb52097a5 100644 --- a/src/options.hpp +++ b/src/options.hpp @@ -130,7 +130,16 @@ class options_t std::vector input_files; + /** + * Size of buckets used for way node index. Use 0 to disable for backwards + * compatibility. + */ + uint8_t index_bucket_size = 32; + private: + /// Set advanced options from environment + void get_options_from_env(); + /** * Check input options for sanity */