diff --git a/Makefile b/Makefile index 6f26579..21e3c89 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,6 @@ RAM=2 # to make unit test results visible at the end. .PHONY: all all: docker-exec-region docker-exec-input-file \ - docker-exec-replication-w-input-file \ docker-exec-default unit-tests .PHONY: docker-clean @@ -115,44 +114,6 @@ docker-exec-input-file: build-run-docker -.PHONE: docker-exec-replication-w-input-file -docker-exec-replication-w-input-file: build-run-docker - # NOTE: This step tests --replication file for an initial load. - # It does **NOT** test the actual replication process for updating data - # using replication mode. Testing actual replication over time in this format - # will not be trivial. The historic file used (2021-01-13) cannot be used - # to seed a replication process, there is a time limit in upstream software - # that requires more recency to the source data. This also cannot simply - # download a file from Geofabrik, as the "latest" file will not have a diff - # to apply so also will not test the actual replication. - # - # Open a PR, Issue, discussion on https://github.com/rustprooflabs/pgosm-flex - # if you have an idea on how to implement this testing functionality. 
- - # copy with arbitrary file name to test --input-file - docker cp tests/data/district-of-columbia-2021-01-13.osm.pbf \ - pgosm:/app/output/$(INPUT_FILE_NAME) - - # allow files created in later step to be created - docker exec -it pgosm \ - chown $(CURRENT_UID):$(CURRENT_GID) /app/output/ - # Needed for unit-tests - docker exec -it pgosm \ - chown $(CURRENT_UID):$(CURRENT_GID) /app/docker/ - - # process it, this time without providing the region but directly the filename - docker exec -it \ - -e POSTGRES_PASSWORD=mysecretpassword \ - -e POSTGRES_USER=postgres \ - -u $(CURRENT_UID):$(CURRENT_GID) \ - pgosm python3 docker/pgosm_flex.py \ - --layerset=minimal \ - --ram=$(RAM) \ - --replication \ - --input-file=/app/output/$(INPUT_FILE_NAME) \ - --skip-qgis-style --skip-nested # Make this test run faster - - .PHONE: docker-exec-region docker-exec-region: build-run-docker # copy for simulating region diff --git a/db/deploy/README.md b/db/deploy/README.md new file mode 100644 index 0000000..7efcaca --- /dev/null +++ b/db/deploy/README.md @@ -0,0 +1,6 @@ +# PgOSM Flex SQL deploy scripts + +The scripts in this folder are executed during PgOSM Flex initialization via +the `prepare_osm_schema()` function in `docker/db.py`. +New or removed files in this folder must be adjusted in that function +as appropriate. diff --git a/db/deploy/replication_functions.sql b/db/deploy/replication_functions.sql index 5f0fdf7..ec3070f 100644 --- a/db/deploy/replication_functions.sql +++ b/db/deploy/replication_functions.sql @@ -1,4 +1,9 @@ +/* + Creates functions used for maintaining data when --replication is used. + These functions are also used when using `--update append` mode of + PgOSM Flex. 
+*/ BEGIN; diff --git a/docker/db.py b/docker/db.py index 637fa6a..d04164b 100644 --- a/docker/db.py +++ b/docker/db.py @@ -210,7 +210,10 @@ def log_pg_details(): def prepare_pgosm_db(skip_qgis_style, db_path, import_mode, schema_name): - """Runs through series of steps to prepare database for PgOSM. + """Runs through steps to prepare the target database for PgOSM Flex. + + Includes additional preparation for using --replication and --update append + modes. Parameters -------------------------- @@ -245,6 +248,9 @@ def prepare_pgosm_db(skip_qgis_style, db_path, import_mode, schema_name): schema_name=schema_name) run_insert_pgosm_road(db_path=db_path, schema_name=schema_name) + if import_mode.replication_update or import_mode.update == 'append': + osm2pgsql_replication_start() + def start_import(pgosm_region, pgosm_date, srid, language, layerset, git_info, osm2pgsql_version, import_mode, schema_name, input_file): @@ -477,7 +483,7 @@ def get_db_conn(conn_string): return conn -def pgosm_after_import(flex_path): +def pgosm_after_import(flex_path: str) -> bool: """Runs post-processing SQL via Lua script. Layerset logic is established via environment variable, must happen @@ -508,17 +514,38 @@ def pgosm_after_import(flex_path): def pgosm_nested_admin_polygons(flex_path: str, schema_name: str): - """Runs stored procedure to calculate nested admin polygons via psql. + """Runs two stored procedures to calculate nested admin polygons via psql. 
Parameters ---------------------- flex_path : str schema_name : str """ - sql_raw = f'CALL {schema_name}.build_nested_admin_polygons();' + # Populate the table + sql_raw_1 = f'CALL {schema_name}.populate_place_polygon_nested();' conn_string = os.environ['PGOSM_CONN'] - cmds = ['psql', '-d', conn_string, '-c', sql_raw] + cmds = ['psql', '-d', conn_string, '-c', sql_raw_1] LOGGER.info('Populating place_polygon_nested table (osm.populate_place_polygon_nested() )') + output = subprocess.run(cmds, + text=True, + cwd=flex_path, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + LOGGER.info(f'Nested polygon output: \n {output.stdout}') + + if output.returncode != 0: + err_msg = f'Failed to populate nested polygon data. Return code: {output.returncode}' + LOGGER.error(err_msg) + sys.exit(f'{err_msg} - Check the log output for details.') + + + # Build the data + sql_raw_2 = f' CALL {schema_name}.build_nested_admin_polygons();' + + conn_string = os.environ['PGOSM_CONN'] + cmds = ['psql', '-d', conn_string, '-c', sql_raw_2] LOGGER.info('Building nested polygons... (this can take a while)') output = subprocess.run(cmds, text=True, @@ -537,9 +564,11 @@ def pgosm_nested_admin_polygons(flex_path: str, schema_name: str): def osm2pgsql_replication_start(): """Runs pre-replication step to clean out FKs that would prevent updates. + + This function is necessary for using `--replication` (osm2pgsql-replication) + and `--update append` mode. """ LOGGER.info('Prep database to allow data updates.') - # This use of append applies to both osm2pgsql --append and osm2pgsq-replication, not renaming from "append" sql_raw = 'CALL osm.append_data_start();' with get_db_conn(conn_string=connection_string()) as conn: @@ -547,8 +576,11 @@ def osm2pgsql_replication_start(): cur.execute(sql_raw) -def osm2pgsql_replication_finish(skip_nested): - """Runs post-replication step to put FKs back and refresh materialied views. 
+def osm2pgsql_replication_finish(skip_nested: bool): + """Runs post-replication step to refresh materialized views and rebuild + nested data when appropriate. + + Only needed for `--replication`, not used for `--update append` mode. Parameters --------------------- diff --git a/docker/geofabrik.py b/docker/geofabrik.py index 0b02d4c..59dc6b8 100644 --- a/docker/geofabrik.py +++ b/docker/geofabrik.py @@ -107,8 +107,10 @@ def set_date_from_metadata(pbf_file: str): os.environ['PBF_TIMESTAMP'] = meta_timestamp -def pbf_download_needed(pbf_file_with_date: str, md5_file_with_date: str, - pgosm_date: str) -> bool: +def pbf_download_needed(pbf_file_with_date: str, + md5_file_with_date: str, + pgosm_date: str + ) -> bool: """Decides if the PBF/MD5 files need to be downloaded. Parameters @@ -123,6 +125,8 @@ def pbf_download_needed(pbf_file_with_date: str, md5_file_with_date: str, """ logger = logging.getLogger('pgosm-flex') # If the PBF file exists, check for the MD5 file too. + logger.debug(f'Checking for PBF File: {pbf_file_with_date}') + if os.path.exists(pbf_file_with_date): logger.info(f'PBF File exists {pbf_file_with_date}') diff --git a/docker/helpers.py b/docker/helpers.py index aa4aae5..cff1791 100644 --- a/docker/helpers.py +++ b/docker/helpers.py @@ -109,8 +109,9 @@ def verify_checksum(md5_file: str, path: str): logger.debug('md5sum validated') -def set_env_vars(region, subregion, srid, language, pgosm_date, layerset, - layerset_path, replication, schema_name): +def set_env_vars(region: str, subregion: str, srid: str, language: str, + pgosm_date: str, layerset: str, + layerset_path: str, schema_name: str, skip_nested: bool): """Sets environment variables needed by PgOSM Flex. Also creates DB record in `osm.pgosm_flex` table. @@ -122,11 +123,11 @@ def set_env_vars(region, subregion, srid, language, pgosm_date, layerset, language : str pgosm_date : str layerset : str + Name of layerset matching the INI filename. 
layerset_path : str str when set, or None - replication : bool - Indicates when osm2pgsql-replication is used schema_name : str + skip_nested : bool """ logger = logging.getLogger('pgosm-flex') logger.debug('Ensuring env vars are not set from prior run') @@ -159,6 +160,7 @@ def set_env_vars(region, subregion, srid, language, pgosm_date, layerset, pgosm_region = get_region_combined(region, subregion) logger.debug(f'PGOSM_REGION_COMBINED: {pgosm_region}') + os.environ['SKIP_NESTED'] = str(skip_nested) def get_region_combined(region: str, subregion: str) -> str: @@ -225,7 +227,7 @@ def get_git_info(tag_only: bool=False) -> str: def unset_env_vars(): - """Unsets environment variables used by PgOSM Flex. + """Unset environment variables used by PgOSM Flex. Does not pop POSTGRES_DB on purpose to allow non-Docker operation. """ @@ -239,6 +241,7 @@ def unset_env_vars(): os.environ.pop('PGOSM_CONN', None) os.environ.pop('PGOSM_CONN_PG', None) os.environ.pop('SCHEMA_NAME', None) + os.environ.pop('SKIP_NESTED', None) class ImportMode(): @@ -310,17 +313,17 @@ def okay_to_run(self, prior_import: dict) -> bool: """ self.logger.debug(f'Checking if it is okay to run...') if self.force: - self.logger.warn(f'Using --force, kiss existing data goodbye') + self.logger.warning('Using --force, kiss existing data goodbye.') return True # If no prior imports, do not require force if len(prior_import) == 0: - self.logger.debug(f'No prior import found, okay to proceed.') + self.logger.debug('No prior import found, okay to proceed.') return True prior_replication = prior_import['replication'] - # Check git version against latest. + # Check PgOSM version using Git tags # If current version is lower than prior version from latest import, stop. 
prior_import_version = prior_import['pgosm_flex_version_no_hash'] git_tag = get_git_info(tag_only=True) @@ -345,6 +348,9 @@ def okay_to_run(self, prior_import: dict) -> bool: self.logger.debug('Okay to proceed with replication') return True + if self.update == 'append': + return True + msg = 'Prior data exists in the osm schema and --force was not used.' self.logger.error(msg) return False diff --git a/docker/pgosm_flex.py b/docker/pgosm_flex.py index 190490c..56dd39c 100644 --- a/docker/pgosm_flex.py +++ b/docker/pgosm_flex.py @@ -95,7 +95,8 @@ def run_pgosm_flex(ram, region, subregion, debug, force, region = input_file helpers.set_env_vars(region, subregion, srid, language, pgosm_date, - layerset, layerset_path, replication, schema_name) + layerset, layerset_path, schema_name, + skip_nested) db.wait_for_postgres() if force and db.pg_conn_parts()['pg_host'] == 'localhost': msg = 'Using --force with the built-in database is unnecessary.' @@ -267,7 +268,6 @@ def run_replication_update(skip_nested, flex_path): """ logger = logging.getLogger('pgosm-flex') conn_string = db.connection_string() - db.osm2pgsql_replication_start() update_cmd = """ osm2pgsql-replication update -d $PGOSM_CONN \ @@ -531,10 +531,13 @@ def run_post_processing(flex_path, skip_nested, import_mode, schema_name): logger = logging.getLogger('pgosm-flex') if not import_mode.run_post_sql: - logger.info('Running with --update append: Skipping post-processing SQL') + msg = 'Running with --update append: Skipping post-processing SQL.' + msg += ' Running osm2pgsql_replication_finish() instead.' 
+ logger.info(msg) + db.osm2pgsql_replication_finish(skip_nested=skip_nested) return True - post_processing_sql = db.pgosm_after_import(flex_path) + post_processing_sql = db.pgosm_after_import(flex_path=flex_path) if skip_nested: logger.info('Skipping calculating nested polygons') diff --git a/docker/tests/test_geofabrik.py b/docker/tests/test_geofabrik.py index 73c3349..54234e8 100644 --- a/docker/tests/test_geofabrik.py +++ b/docker/tests/test_geofabrik.py @@ -6,7 +6,10 @@ REGION_US = 'north-america/us' SUBREGION_DC = 'district-of-columbia' LAYERSET = 'default' -PGOSM_DATE = '2021-12-02' +PGOSM_DATE = '2021-01-13' + +PBF_FILE_WITH_DATE = f'/app/tests/data/district-of-columbia-{PGOSM_DATE}.osm.pbf' +MD5_FILE_WITH_DATE = f'/app/tests/data/district-of-columbia-{PGOSM_DATE}.osm.pbf.md5' class GeofabrikTests(unittest.TestCase): @@ -19,8 +22,8 @@ def setUp(self): pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=None, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) def tearDown(self): @@ -34,15 +37,15 @@ def test_get_region_filename_returns_subregion_when_exists(self): def test_get_region_filename_returns_region_when_subregion_None(self): # Override Subregion to None helpers.unset_env_vars() - helpers.set_env_vars(region='north-america/us', + helpers.set_env_vars(region=REGION_US, subregion=None, srid=3857, language=None, pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=None, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) result = geofabrik.get_region_filename() expected = f'{REGION_US}-latest.osm.pbf' @@ -64,7 +67,6 @@ def test_get_pbf_url_returns_proper_with_region_and_subregion(self): def test_pbf_download_needed_returns_boolean(self): pgosm_date = geofabrik.helpers.get_today() - region_filename = geofabrik.get_region_filename() expected = bool result = geofabrik.pbf_download_needed(pbf_file_with_date='does-not-matter', md5_file_with_date='not-a-file', @@ -73,11 +75,9 @@ def 
test_pbf_download_needed_returns_boolean(self): def test_pbf_download_needed_returns_true_when_file_not_exists(self): pgosm_date = geofabrik.helpers.get_today() - region_filename = geofabrik.get_region_filename() expected = True result = geofabrik.pbf_download_needed(pbf_file_with_date='does-not-matter', md5_file_with_date='not-a-file', pgosm_date=pgosm_date) self.assertEqual(expected, result) - diff --git a/docker/tests/test_pgosm_flex.py b/docker/tests/test_pgosm_flex.py index 1d7670f..ed190a0 100644 --- a/docker/tests/test_pgosm_flex.py +++ b/docker/tests/test_pgosm_flex.py @@ -19,8 +19,8 @@ def setUp(self): pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=None, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) def tearDown(self): @@ -91,8 +91,8 @@ def test_get_export_filename_region_only(self): pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=None, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) input_file = None result = pgosm_flex.get_export_filename(input_file) @@ -109,8 +109,8 @@ def test_layerset_include_place_returns_boolean(self): pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=layerset_path, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) paths = pgosm_flex.get_paths() result = pgosm_flex.layerset_include_place(flex_path=paths['flex_path']) @@ -128,8 +128,8 @@ def test_layerset_include_place_returns_True_with_default_layerset(self): pgosm_date=PGOSM_DATE, layerset=LAYERSET, layerset_path=layerset_path, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) paths = pgosm_flex.get_paths() actual = pgosm_flex.layerset_include_place(flex_path=paths['flex_path']) @@ -147,8 +147,8 @@ def test_layerset_include_place_returns_false_when_place_false_in_ini(self): pgosm_date=PGOSM_DATE, layerset=layerset, layerset_path=layerset_path, - replication=False, - schema_name='osm') + schema_name='osm', + 
skip_nested=True) paths = pgosm_flex.get_paths() actual = pgosm_flex.layerset_include_place(flex_path=paths['flex_path']) @@ -166,8 +166,8 @@ def test_layerset_include_place_returns_false_when_place_missing_in_ini(self): pgosm_date=PGOSM_DATE, layerset=layerset, layerset_path=layerset_path, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) paths = pgosm_flex.get_paths() actual = pgosm_flex.layerset_include_place(flex_path=paths['flex_path']) @@ -185,8 +185,8 @@ def test_layerset_include_place_returns_true_when_place_true_in_ini(self): pgosm_date=PGOSM_DATE, layerset=layerset, layerset_path=layerset_path, - replication=False, - schema_name='osm') + schema_name='osm', + skip_nested=True) paths = pgosm_flex.get_paths() actual = pgosm_flex.layerset_include_place(flex_path=paths['flex_path']) diff --git a/docs/src/docker-build.md b/docs/src/docker-build.md index 5962052..993ffb2 100644 --- a/docs/src/docker-build.md +++ b/docs/src/docker-build.md @@ -89,3 +89,29 @@ docker pull postgis/postgis:16-3.4 docker build --no-cache -t rustprooflabs/pgosm-flex:dev . ``` + +## Building PgOSM Flex from an `osm2pgsql` feature branch + +There are times it is helpful to build the PgOSM Flex Docker image with a +specific feature branch. To do this, change the `OSM2PGSQL_BRANCH` +and/or `OSM2PGSQL_REPO` arguments as necessary at the beginning of the Dockerfile. +The production setup looks like the following example. + + +```Dockerfile +ARG OSM2PGSQL_BRANCH=master +ARG OSM2PGSQL_REPO=https://github.com/osm2pgsql-dev/osm2pgsql.git +``` + +To test the feature branch associated with +[osm2pgsql #2212](https://github.com/osm2pgsql-dev/osm2pgsql/pull/2212) +the updated version was set like the following example. +This changes the `OSM2PGSQL_BRANCH` to `check-date-on-replication-init` +and changes the username in the `OSM2PGSQL_REPO` to `lonvia`. 
+ + +```Dockerfile +ARG OSM2PGSQL_BRANCH=check-date-on-replication-init +ARG OSM2PGSQL_REPO=https://github.com/lonvia/osm2pgsql.git +``` + diff --git a/docs/src/replication.md b/docs/src/replication.md index d751ad5..88db77c 100644 --- a/docs/src/replication.md +++ b/docs/src/replication.md @@ -34,6 +34,12 @@ your specific database and process.** ---- +## Not tested by `make` + +The function exposed by `--replication` is not tested via PgOSM's `Makefile`. + + + ## Max connections The other important change when using replication is to increase Postgres' `max_connections`. diff --git a/docs/src/update-mode.md b/docs/src/update-mode.md index b7540d6..01d0daa 100644 --- a/docs/src/update-mode.md +++ b/docs/src/update-mode.md @@ -1,84 +1,56 @@ # Using Update Mode -Running in experimental Update mode enables using osm2pgsql's `--append` -option. - -> Note: This is **not** the `--append` option that existed in PgOSM Flex 0.6.3 and prior. +Running with `--update` enables using osm2pgsql's `--append` option to load a second +input file. The PgOSM Flex functionality uses `--update create` and `--update append`. +See the discussion in [#275](https://github.com/rustprooflabs/pgosm-flex/issues/275) +for more context behind the intent for this feature. +Using `--update append` requires the initial import used `--update create`. Attempting +to use `--update append` without first using `--update create` results in the error: +"ERROR: This database is not updatable. To create an updatable database use --slim (without --drop)." -## Testing steps -Important -- Needs higher max connections! +If your goal is to easily refresh the data for a single, standard region/sub-region +you should investigate the [`--replication` feature](/replication.md). Using +replication is the easier and more efficient way to maintain data. +> Note: This is **not** the `--append` option that existed in PgOSM Flex 0.6.3 and prior. -```bash -docker stop pgosm && docker build -t rustprooflabs/pgosm-flex . 
-docker run --name pgosm -d --rm \ - -v ~/pgosm-data:/app/output \ - -v /etc/localtime:/etc/localtime:ro \ - -e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \ - -p 5433:5432 -d rustprooflabs/pgosm-flex \ - -c max_connections=300 -``` - -Run fresh import w/ D.C. using `--update create`. This ensures osm2pgsql -uses `--slim` w/out `--drop`. Tested from commit `672d9fd`. +## Example +The following command uses `--update create` to load the `district-of-columbia` +sub-region. This example assumes you have set the environment variables and +have run the docker container as shown in the [Quick Start](quick-start.md) section. ```bash -time docker exec -it \ +docker exec -it \ pgosm python3 docker/pgosm_flex.py \ --ram=8 \ --region=north-america/us \ --subregion=district-of-columbia \ - --skip-nested --skip-dump \ --update create - -... - -2022-12-27 09:02:37,654:INFO:pgosm-flex:helpers:PgOSM-Flex version: 0.6.3 672d9fd - -... - -real 0m43.904s -user 0m0.020s -sys 0m0.012s ``` -Run with a second sub-region using `--update append`. +The following loads a second sub-region (`maryland`) using `--update append`. ```bash time docker exec -it \ pgosm python3 docker/pgosm_flex.py \ --ram=8 \ --region=north-america/us \ - --subregion=new-hampshire \ - --skip-nested --skip-dump \ + --subregion=maryland \ --update append - -... - -2022-12-27 10:14:26,792:INFO:pgosm-flex:helpers:2022-12-27 10:14:26 osm2pgsql took 1420s (23m 40s) overall. -2022-12-27 10:14:26,832:INFO:pgosm-flex:pgosm_flex:osm2pgsql completed -2022-12-27 10:14:26,832:INFO:pgosm-flex:pgosm_flex:Running with --update append: Skipping post-processing SQL -2022-12-27 10:14:26,832:INFO:pgosm-flex:pgosm_flex:Skipping pg_dump -2022-12-27 10:14:26,832:INFO:pgosm-flex:pgosm_flex:PgOSM Flex complete! - -real 23m47.564s -user 0m0.083s -sys 0m0.025s - ``` -It seems to work, new output at the end: `Skipping post-processing SQL`. - -Verified that both New Hampshire and D.C. regions were loaded in `osm.place_polygon`. 
## Smaller test -Put the following into `~/pgosm-data/extracts/colorado-extract.json`: +> This section has notes that probably belong elsewhere but I'm leaving them here for now. +> They were initially helpful for testing the logic for this functionality. + +Put the following into `~/pgosm-data/extracts/colorado-extract.json`. ```json { @@ -109,14 +81,14 @@ Put the following into `~/pgosm-data/extracts/colorado-extract.json`: ``` -Create Boulder and Longmont extracts. +Create Boulder and Longmont extracts using `osmium extract`. -``` +```bash osmium extract -c extracts/colorado-extracts.json colorado-2022-12-27.osm.pbf ``` -``` +```bash ryanlambert@tag201:~/pgosm-data$ ls -alh | grep boulder -rw-rw-r-- 1 ryanlambert ryanlambert 2.4M Dec 27 14:31 colorado-boulder-latest.osm.pbf ryanlambert@tag201:~/pgosm-data$ ls -alh | grep longmont @@ -125,24 +97,24 @@ ryanlambert@tag201:~/pgosm-data$ ls -alh | grep longmont Takes 11 seconds. -``` -time docker exec -it \ +```bash +docker exec -it \ pgosm python3 docker/pgosm_flex.py \ --ram=8 \ --region=north-america/us \ --subregion=colorado-longmont --input-file colorado-longmont-latest.osm.pbf \ - --skip-dump --update create + --update create ``` Takes 2 minutes. 
-``` -time docker exec -it \ +```bash +docker exec -it \ pgosm python3 docker/pgosm_flex.py \ --ram=8 \ --region=north-america/us \ --subregion=colorado-boulder --input-file colorado-boulder-latest.osm.pbf \ - --skip-dump --update append + --update append ``` diff --git a/flex-config/run-sql.lua b/flex-config/run-sql.lua index 7925647..26de478 100644 --- a/flex-config/run-sql.lua +++ b/flex-config/run-sql.lua @@ -23,6 +23,19 @@ else end +local skip_nested_env = os.getenv("SKIP_NESTED") +local skip_nested = nil + +print(string.format('SKIP_NESTED: %s', skip_nested_env)) + +if skip_nested_env then + skip_nested = skip_nested_env +else + error('Environment variable SKIP_NESTED must be set.') +end + + + layers = {'amenity', 'building', 'building_combined_point', 'indoor' , 'infrastructure', 'landuse', 'leisure' , 'natural', 'place', 'poi', 'public_transport' @@ -64,6 +77,12 @@ end local errors = 0 +if skip_nested then + if not post_processing("place_polygon_nested") then + errors = errors + 1 + end +end + for ix, layer in ipairs(layers) do if conf['layerset'][layer] then if not post_processing(layer) then diff --git a/flex-config/sql/place.sql b/flex-config/sql/place.sql index a0df97d..bc22d99 100644 --- a/flex-config/sql/place.sql +++ b/flex-config/sql/place.sql @@ -57,185 +57,6 @@ DELETE -DROP TABLE IF EXISTS osm.place_polygon_nested; -CREATE TABLE osm.place_polygon_nested -( - osm_id BIGINT NOT NULL PRIMARY KEY, - name TEXT NOT NULL, - osm_type TEXT NOT NULL, - admin_level INT NOT NULL, - nest_level BIGINT NULL, - name_path TEXT[] NULL, - osm_id_path BIGINT[] NULL, - admin_level_path INT[] NULL, - row_innermost BOOLEAN NOT NULL GENERATED ALWAYS AS ( - CASE WHEN osm_id_path[array_length(osm_id_path, 1)] = osm_id THEN True - ELSE False - END - ) STORED, - innermost BOOLEAN NOT NULL DEFAULT False, - geom GEOMETRY NOT NULL, -- Can't enforce geom type b/c SRID is dynamic project wide. 
Can't set MULTIPOLYGON w/out SRID too - CONSTRAINT fk_place_polygon_nested - FOREIGN KEY (osm_id) REFERENCES osm.place_polygon (osm_id) -); - - - -CREATE INDEX ix_osm_place_polygon_nested_osm_id - ON osm.place_polygon_nested (osm_id) -; -CREATE INDEX ix_osm_place_polygon_nested_name_path - ON osm.place_polygon_nested USING GIN (name_path) -; -CREATE INDEX ix_osm_place_polygon_nested_osm_id_path - ON osm.place_polygon_nested USING GIN (osm_id_path) -; - -COMMENT ON TABLE osm.place_polygon_nested IS 'Provides hierarchy of administrative polygons. Built on top of osm.vplace_polygon. Artifact of PgOSM-Flex (place.sql).'; - -COMMENT ON COLUMN osm.place_polygon_nested.osm_id IS 'OpenStreetMap ID. Unique along with geometry type.'; - - -COMMENT ON COLUMN osm.place_polygon_nested.admin_level IS 'Value from admin_level if it exists. Defaults to 99 if not.'; -COMMENT ON COLUMN osm.place_polygon_nested.nest_level IS 'How many polygons is the current polygon nested within. 1 indicates polygon with no containing polygon.'; -COMMENT ON COLUMN osm.place_polygon_nested.name_path IS 'Array of names of the current polygon (last) and all containing polygons.'; -COMMENT ON COLUMN osm.place_polygon_nested.osm_id_path IS 'Array of osm_id for the current polygon (last) and all containing polygons.'; -COMMENT ON COLUMN osm.place_polygon_nested.admin_level_path IS 'Array of admin_level values for the current polygon (last) and all containing polygons.'; -COMMENT ON COLUMN osm.place_polygon_nested.name IS 'Best name option determined by helpers.get_name(). Keys with priority are: name, short_name, alt_name and loc_name. See pgosm-flex/flex-config/helpers.lua for full logic of selection.'; -COMMENT ON COLUMN osm.place_polygon_nested.row_innermost IS 'Indicates if the osm_id is the most inner ID of the current row. 
Used to calculated innermost after all nesting paths have been calculated.'; -COMMENT ON COLUMN osm.place_polygon_nested.innermost IS 'Indicates this row is the innermost admin level of the current data set and does **not** itself contain another admin polygon. Calculated by procedure osm.build_nested_admin_polygons() defined in pgosm-flex/flex-config/place.sql.'; -COMMENT ON COLUMN osm.place_polygon_nested.osm_type IS 'Values from place if a place tag exists. If no place tag, values boundary or admin_level indicate the source of the feature.'; - -COMMENT ON COLUMN osm.place_polygon_nested.geom IS 'Geometry loaded by osm2pgsql.'; - - -CREATE OR REPLACE PROCEDURE osm.populate_place_polygon_nested() -LANGUAGE sql -AS $$ - - - INSERT INTO osm.place_polygon_nested (osm_id, name, osm_type, admin_level, geom) - SELECT p.osm_id, p.name, p.osm_type, - COALESCE(p.admin_level::INT, 99) AS admin_level, - geom - FROM osm.place_polygon p - WHERE (p.boundary = 'administrative' - OR p.osm_type IN ('neighborhood', 'city', 'suburb', 'town', 'admin_level', 'locality') - ) - AND p.name IS NOT NULL - AND NOT EXISTS ( - SELECT osm_id - FROM osm.place_polygon_nested n - WHERE n.osm_id = p.osm_id - ) - ; - -$$; - - -CALL osm.populate_place_polygon_nested(); - - -CREATE OR REPLACE PROCEDURE osm.build_nested_admin_polygons( - batch_row_limit BIGINT = 100 - ) - LANGUAGE plpgsql - AS $$ - DECLARE - rows_to_update BIGINT; - BEGIN - - SELECT COUNT(*) INTO rows_to_update - FROM osm.place_polygon_nested r - WHERE nest_level IS NULL - ; - RAISE NOTICE 'Rows to update: %', rows_to_update; - RAISE NOTICE 'Updating in batches of % rows', $1; - - FOR counter IN 1..rows_to_update by $1 LOOP - - DROP TABLE IF EXISTS places_for_nesting; - CREATE TEMP TABLE places_for_nesting AS - SELECT p.osm_id - FROM osm.place_polygon_nested p - WHERE p.name IS NOT NULL - AND (admin_level IS NOT NULL - OR osm_type IN ('boundary', 'admin_level', 'suburb', - 'neighbourhood') - ) - ; - CREATE UNIQUE INDEX 
tmp_ix_places_for_nesting - ON places_for_nesting (osm_id); - - - DROP TABLE IF EXISTS place_batch; - CREATE TEMP TABLE place_batch AS - SELECT p.osm_id, t.nest_level, t.name_path, t.osm_id_path, t.admin_level_path - FROM osm.place_polygon p - INNER JOIN LATERAL ( - SELECT COUNT(i.osm_id) AS nest_level, - ARRAY_AGG(i.name ORDER BY COALESCE(i.admin_level::INT, 99::INT) ASC) AS name_path, - ARRAY_AGG(i.osm_id ORDER BY COALESCE(i.admin_level::INT, 99::INT) ASC) AS osm_id_path, - ARRAY_AGG(COALESCE(i.admin_level::INT, 99::INT) ORDER BY i.admin_level ASC) AS admin_level_path - FROM osm.place_polygon i - WHERE ST_Within(p.geom, i.geom) - AND EXISTS ( - SELECT 1 FROM places_for_nesting include - WHERE i.osm_id = include.osm_id - ) - AND i.name IS NOT NULL - ) t ON True - WHERE EXISTS ( - SELECT 1 FROM osm.place_polygon_nested miss - WHERE miss.nest_level IS NULL - AND p.osm_id = miss.osm_id - ) - AND EXISTS ( - SELECT 1 FROM places_for_nesting include - WHERE p.osm_id = include.osm_id - ) - LIMIT $1 - ; - - UPDATE osm.place_polygon_nested n - SET nest_level = t.nest_level, - name_path = t.name_path, - osm_id_path = t.osm_id_path, - admin_level_path = t.admin_level_path - FROM place_batch t - WHERE n.osm_id = t.osm_id - ; - COMMIT; - END LOOP; - - DROP TABLE IF EXISTS place_batch; - DROP TABLE IF EXISTS places_for_nesting; - - -- With all nested paths calculated the innermost value can be determined. - WITH calc_inner AS ( - SELECT a.osm_id - FROM osm.place_polygon_nested a - WHERE a.row_innermost -- Start with per row check... - -- If an osm_id is found in any other path, cannot be innermost - AND NOT EXISTS ( - SELECT 1 - FROM osm.place_polygon_nested i - WHERE a.osm_id <> i.osm_id - AND a.osm_id = ANY(osm_id_path) - ) - ) - UPDATE osm.place_polygon_nested n - SET innermost = True - FROM calc_inner i - WHERE n.osm_id = i.osm_id - ; -END $$; - - - -COMMENT ON PROCEDURE osm.build_nested_admin_polygons IS 'Warning: Expensive procedure! 
Use to populate the osm.place_polygon_nested table. This procedure is not ran as part of SQL script automatically due to excessive run time on large regions.'; - - CREATE MATERIALIZED VIEW osm.vplace_polygon_subdivide AS SELECT osm_id, ST_Subdivide(geom) AS geom FROM osm.place_polygon diff --git a/flex-config/sql/place_polygon_nested.sql b/flex-config/sql/place_polygon_nested.sql new file mode 100644 index 0000000..3ee0c84 --- /dev/null +++ b/flex-config/sql/place_polygon_nested.sql @@ -0,0 +1,177 @@ + + +DROP TABLE IF EXISTS osm.place_polygon_nested; +CREATE TABLE osm.place_polygon_nested +( + osm_id BIGINT NOT NULL PRIMARY KEY, + name TEXT NOT NULL, + osm_type TEXT NOT NULL, + admin_level INT NOT NULL, + nest_level BIGINT NULL, + name_path TEXT[] NULL, + osm_id_path BIGINT[] NULL, + admin_level_path INT[] NULL, + row_innermost BOOLEAN NOT NULL GENERATED ALWAYS AS ( + CASE WHEN osm_id_path[array_length(osm_id_path, 1)] = osm_id THEN True + ELSE False + END + ) STORED, + innermost BOOLEAN NOT NULL DEFAULT False, + geom GEOMETRY NOT NULL, -- Can't enforce geom type b/c SRID is dynamic project wide. Can't set MULTIPOLYGON w/out SRID too + CONSTRAINT fk_place_polygon_nested + FOREIGN KEY (osm_id) REFERENCES osm.place_polygon (osm_id) +); + + + +CREATE INDEX ix_osm_place_polygon_nested_osm_id + ON osm.place_polygon_nested (osm_id) +; +CREATE INDEX ix_osm_place_polygon_nested_name_path + ON osm.place_polygon_nested USING GIN (name_path) +; +CREATE INDEX ix_osm_place_polygon_nested_osm_id_path + ON osm.place_polygon_nested USING GIN (osm_id_path) +; + +COMMENT ON TABLE osm.place_polygon_nested IS 'Provides hierarchy of administrative polygons. Built on top of osm.vplace_polygon. Artifact of PgOSM-Flex (place.sql).'; + +COMMENT ON COLUMN osm.place_polygon_nested.osm_id IS 'OpenStreetMap ID. Unique along with geometry type.'; + + +COMMENT ON COLUMN osm.place_polygon_nested.admin_level IS 'Value from admin_level if it exists. 
Defaults to 99 if not.'; +COMMENT ON COLUMN osm.place_polygon_nested.nest_level IS 'How many polygons is the current polygon nested within. 1 indicates polygon with no containing polygon.'; +COMMENT ON COLUMN osm.place_polygon_nested.name_path IS 'Array of names of the current polygon (last) and all containing polygons.'; +COMMENT ON COLUMN osm.place_polygon_nested.osm_id_path IS 'Array of osm_id for the current polygon (last) and all containing polygons.'; +COMMENT ON COLUMN osm.place_polygon_nested.admin_level_path IS 'Array of admin_level values for the current polygon (last) and all containing polygons.'; +COMMENT ON COLUMN osm.place_polygon_nested.name IS 'Best name option determined by helpers.get_name(). Keys with priority are: name, short_name, alt_name and loc_name. See pgosm-flex/flex-config/helpers.lua for full logic of selection.'; +COMMENT ON COLUMN osm.place_polygon_nested.row_innermost IS 'Indicates if the osm_id is the innermost ID of the current row. Used to calculate innermost after all nesting paths have been calculated.'; +COMMENT ON COLUMN osm.place_polygon_nested.innermost IS 'Indicates this row is the innermost admin level of the current data set and does **not** itself contain another admin polygon. Calculated by procedure osm.build_nested_admin_polygons() defined in pgosm-flex/flex-config/sql/place_polygon_nested.sql.'; +COMMENT ON COLUMN osm.place_polygon_nested.osm_type IS 'Values from place if a place tag exists. 
If no place tag, values boundary or admin_level indicate the source of the feature.'; + +COMMENT ON COLUMN osm.place_polygon_nested.geom IS 'Geometry loaded by osm2pgsql.'; + + +CREATE OR REPLACE PROCEDURE osm.populate_place_polygon_nested() +LANGUAGE sql +AS $$ + + + INSERT INTO osm.place_polygon_nested (osm_id, name, osm_type, admin_level, geom) + SELECT p.osm_id, p.name, p.osm_type, + COALESCE(p.admin_level::INT, 99) AS admin_level, + geom + FROM osm.place_polygon p + WHERE (p.boundary = 'administrative' + OR p.osm_type IN ('neighborhood', 'city', 'suburb', 'town', 'admin_level', 'locality') + ) + AND p.name IS NOT NULL + AND NOT EXISTS ( + SELECT osm_id + FROM osm.place_polygon_nested n + WHERE n.osm_id = p.osm_id + ) + ; + +$$; + + + +CREATE OR REPLACE PROCEDURE osm.build_nested_admin_polygons( + batch_row_limit BIGINT = 100 + ) + LANGUAGE plpgsql + AS $$ + DECLARE + rows_to_update BIGINT; + BEGIN + + SELECT COUNT(*) INTO rows_to_update + FROM osm.place_polygon_nested r + WHERE nest_level IS NULL + ; + RAISE NOTICE 'Rows to update: %', rows_to_update; + RAISE NOTICE 'Updating in batches of % rows', $1; + + FOR counter IN 1..rows_to_update by $1 LOOP + + DROP TABLE IF EXISTS places_for_nesting; + CREATE TEMP TABLE places_for_nesting AS + SELECT p.osm_id + FROM osm.place_polygon_nested p + WHERE p.name IS NOT NULL + AND (admin_level IS NOT NULL + OR osm_type IN ('boundary', 'admin_level', 'suburb', + 'neighbourhood') + ) + ; + CREATE UNIQUE INDEX tmp_ix_places_for_nesting + ON places_for_nesting (osm_id); + + + DROP TABLE IF EXISTS place_batch; + CREATE TEMP TABLE place_batch AS + SELECT p.osm_id, t.nest_level, t.name_path, t.osm_id_path, t.admin_level_path + FROM osm.place_polygon p + INNER JOIN LATERAL ( + SELECT COUNT(i.osm_id) AS nest_level, + ARRAY_AGG(i.name ORDER BY COALESCE(i.admin_level::INT, 99::INT) ASC) AS name_path, + ARRAY_AGG(i.osm_id ORDER BY COALESCE(i.admin_level::INT, 99::INT) ASC) AS osm_id_path, + ARRAY_AGG(COALESCE(i.admin_level::INT, 
99::INT) ORDER BY i.admin_level ASC) AS admin_level_path + FROM osm.place_polygon i + WHERE ST_Within(p.geom, i.geom) + AND EXISTS ( + SELECT 1 FROM places_for_nesting include + WHERE i.osm_id = include.osm_id + ) + AND i.name IS NOT NULL + ) t ON True + WHERE EXISTS ( + SELECT 1 FROM osm.place_polygon_nested miss + WHERE miss.nest_level IS NULL + AND p.osm_id = miss.osm_id + ) + AND EXISTS ( + SELECT 1 FROM places_for_nesting include + WHERE p.osm_id = include.osm_id + ) + LIMIT $1 + ; + + UPDATE osm.place_polygon_nested n + SET nest_level = t.nest_level, + name_path = t.name_path, + osm_id_path = t.osm_id_path, + admin_level_path = t.admin_level_path + FROM place_batch t + WHERE n.osm_id = t.osm_id + ; + COMMIT; + END LOOP; + + DROP TABLE IF EXISTS place_batch; + DROP TABLE IF EXISTS places_for_nesting; + + -- With all nested paths calculated the innermost value can be determined. + WITH calc_inner AS ( + SELECT a.osm_id + FROM osm.place_polygon_nested a + WHERE a.row_innermost -- Start with per row check... + -- If an osm_id is found in any other path, cannot be innermost + AND NOT EXISTS ( + SELECT 1 + FROM osm.place_polygon_nested i + WHERE a.osm_id <> i.osm_id + AND a.osm_id = ANY(osm_id_path) + ) + ) + UPDATE osm.place_polygon_nested n + SET innermost = True + FROM calc_inner i + WHERE n.osm_id = i.osm_id + ; +END $$; + + + +COMMENT ON PROCEDURE osm.build_nested_admin_polygons IS 'Warning: Expensive procedure! Use to populate the osm.place_polygon_nested table. This procedure is not run as part of SQL script automatically due to excessive run time on large regions.';