diff --git a/.travis.yml b/.travis.yml index ee346469..8fb67e6d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -64,6 +64,9 @@ before_install: # Install pygdal bindings compatible with dpkg based gdal libs - pip install pygdal>="$(gdal-config --version)" + - pip install --progress-bar off --requirement requirements-test.txt + - pip install git+https://github.com/mapbox/rasterio.git@master + install: - travis_retry pip install --upgrade pytest - travis_retry pip install codecov cython boto3 diff --git a/check-code.sh b/check-code.sh index 9501dc75..428c4d32 100755 --- a/check-code.sh +++ b/check-code.sh @@ -26,5 +26,5 @@ shellcheck ./**/*.sh yamllint ./**/*.yaml # Users can specify extra folders (ie. integration_tests) as arguments. -py.test -r sx --cov digitalearthau --durations=5 digitalearthau scripts/**/*.py "$@" +py.test -r sx --cov digitalearthau --durations=5 digitalearthau scripts_tests/*.py scripts/**/*.py "$@" diff --git a/digitalearthau/testing/plugin.py b/digitalearthau/testing/plugin.py index b73fdee1..d956a46f 100644 --- a/digitalearthau/testing/plugin.py +++ b/digitalearthau/testing/plugin.py @@ -1,14 +1,11 @@ import itertools - import os -import pytest from pathlib import Path from typing import Iterable -import datacube -import digitalearthau -import digitalearthau.system +import pytest from datacube.config import LocalConfig + from . 
import factories # These are unavoidable in pytests due to fixtures @@ -26,6 +23,8 @@ def pytest_report_header(config): if config.getoption('verbose') > 0: + import datacube + import digitalearthau return ( f"digitaleathau {digitalearthau.__version__}, " f"opendatacube {datacube.__version__}" diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 00000000..80cfbee8 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,2 @@ +colorama +xmlschema diff --git a/scripts/index_aster_lpdaac.py b/scripts/index_aster_lpdaac.py new file mode 100755 index 00000000..39b4ed8b --- /dev/null +++ b/scripts/index_aster_lpdaac.py @@ -0,0 +1,641 @@ +#!/usr/bin/env python +# language=rst +""" +ASTER L1T Indexing +================== + +This program allows indexing the Australia region ASTER (Advanced Spaceborne Thermal +Emission and Reflection Radiometer) L1T data stored on the NCI into an ODC Database. + +Background +---------- + +ASTER data consists of visible and near infrared (VNIR) frequencies +with three bands at 15-meter resolution, short-wave infrared (SWIR) +frequencies with six bands at 30-meter resolution, and thermal infrared (TIR) +wavelength with five bands at 90-meter resolution. + +Further details of AST_L1T data is available from +https://lpdaac.usgs.gov/dataset_discovery/aster/aster_products_table/ast_l1t_v003 + +The ASTER L1T data product is derived from ASTER Level 1A data that has been +geometrically corrected and reprojected to a north-up Universal Transverse Mercator (UTM) +projection. +(Please see: https://lpdaac.usgs.gov/sites/default/files/public/elearning/ASTER_L1T_Tutorial.html) + +Further, depending on whether the following modes are enabled, dataset may present +different bands. 
+ +:: + + ASTEROBSERVATIONMODE.1=VNIR1, ON/OFF + ASTEROBSERVATIONMODE.2=VNIR2, ON/OFF + ASTEROBSERVATIONMODE.3=SWIR, ON/OFF + ASTEROBSERVATIONMODE.4=TIR, ON/OFF + +Regarding `SWIR` bands please note the following advice from +https://asterweb.jpl.nasa.gov/swir-alert.asp + +:: + + ASTER SWIR detectors are no longer functioning due to anomalously high SWIR detector + temperatures. ASTER SWIR data acquired since April 2008 are not useable, and + show saturation of values and severe striping. All attempts to bring the SWIR bands + back to life have failed, and no further action is envisioned. -- January 12, 2009 + +Usage +----- + +This script has three modes, all of which need a directory of ASTER_L1T HDF data files. + + 1. Create the product definition in an ODC database + 2. Generate VRT files with GDAL usable geospatial information + 3. Index Datasets into an ODC Database + +The input data is stored in sets of hdf files in `/g/data/v10/ASTER_AU/`. + + + +:: + + ./index_nci_aster_lpdaac.py create-product --product aster_l1t_vnir /g/data/v10/ASTER_AU/2018.01.01 + ./index_nci_aster_lpdaac.py create-vrt --product aster_l1t_vnir /g/data/v10/ASTER_AU/2018.01.01 + ./index_nci_aster_lpdaac.py index-data --product aster_l1t_vnir /g/data/v10/ASTER_AU/2018.01.01 + +:: + + for i in /g/data/v10/ASTER_AU/*; do + ./index_nci_aster_lpdaac.py --config aster_lpdacc.conf index-data + --product aster_l1t_vnir $i + done + +Test Database Creation Steps +---------------------------- + +:: + + psql -h agdcdev-db.nci.org.au + CREATE DATABASE aster_lpdaac WITH OWNER agdc_admin; + GRANT TEMPORARY, CONNECT ON DATABASE aster_lpdaac to public; + +Example ODC Configuration: ``aster_lpdaac.conf`` + +:: + + [datacube] + db_hostname: agdcdev-db.nci.org.au + db_port: 6432 + db_database: aster_lpdaac + +:: + + datacube --config aster_lpdaac.conf system init + + +Implementation Details +---------------------- + +The ODC Index datasets points to to the corresponding VRT files in order to access +raster 
measurement data. The VRT file in turn points to the original `.hdf` file
+through `absolute path names` (Relative path names are not working at the moment,
+and it is advised that the VRT files must be generated for the final resident
+location of `.hdf` files.)
+
+Each VRT file specifies a consistent set of bands from ASTER as a single product.
+For example, `vnir` sensors correspond to `aster_l1t_vnir` product, `tir`
+sensors correspond to `aster_l1t_tir` product, and `swir` sensors correspond
+to `aster_l1t_swir` product. The corresponding definitions of these product
+names and corresponding bands (with band names as identified in the original
+`hdf` file) are defined in the constant `PRODUCTS` in this script.
+
+It attempts to create stable UUIDs for the generated Datasets,
+based on the file path and modification time of the underlying HDF file data
+as well as product name.
+
+The VRT file for each product and dataset YAML file for each product is written
+onto the same directory that contains the original `.hdf` file.
+""" +import functools +import json +import logging +import uuid +from datetime import datetime, timezone +from pathlib import Path + +import click +import numpy as np +import rasterio +import yaml +from osgeo import gdal, osr + +from datacube import Datacube +from datacube.index.hl import Doc2Dataset + +LOG = logging.getLogger(__name__) + +PRODUCTS = { + 'aster_l1t_vnir': {'bands': {'ImageData2', 'ImageData1', 'ImageData3N'}, + 'include_only': {'ASTER', 'CORRECT', 'EAST', 'FLY', 'GAIN', 'INPUT', 'LOWER', 'MAP', + 'NORTH', 'NUMBERGCP', 'ORBIT', 'POINT', 'QAPERCENT', 'RECURRENT', 'SCENE', + 'SIZE', 'SOLAR', 'SOUTH', 'UPPER', 'UTMZONENUMBER', 'WEST'}}, + 'aster_l1t_swir': {'bands': {'ImageData4', 'ImageData5', 'ImageData6', 'ImageData7', 'ImageData8', 'ImageData9'}, + 'include_only': {'ASTER', 'CORRECT', 'EAST', 'FLY', 'GAIN', 'INPUT', 'LOWER', 'MAP', + 'NORTH', 'NUMBERGCP', 'ORBIT', 'POINT', 'QAPERCENT', 'RECURRENT', 'SCENE', + 'SIZE', 'SOLAR', 'SOUTH', 'UPPER', 'UTMZONENUMBER', 'WEST'}}, + 'aster_l1t_tir': {'bands': {'ImageData10', 'ImageData11', 'ImageData12', 'ImageData13', 'ImageData14'}, + 'exclude': {'BAND', 'CALENDAR', 'COARSE', 'FUTURE', 'GEO', 'HDF', 'IDENT', 'IMAGE', + 'PGE', 'PROCESSED', 'PROCESSING', 'RADIO', 'RECEIVING', 'REPROCESSING', 'SOURCE', + 'TIME', 'UTMZONECODE'}} +} + + +@click.group(help=__doc__) +@click.option('--config', '-c', help="Pass the configuration file to access the database", + type=click.Path(exists=True)) +@click.pass_context +def cli(ctx, config): + """ Used to pass the datacube index to functions via click.""" + ctx.obj = Datacube(config=config).index + + +@cli.command() +@click.argument('path') +@click.option('--product', help='Which ASTER product? 
vnir, swir, or tir') +def create_vrt(path, product): + file_paths = find_lpdaac_file_paths(Path(path)) + + for file_path in file_paths: + print(file_path) + bands = selected_bands(file_path, product) + if bands: + vrt = generate_vrt(file_path, bands) + with open(vrt_file_path(file_path, product), 'w') as fd: + fd.write(vrt) + else: + logging.error("File does not have bands of this product: %s", file_path) + + +@cli.command() +@click.argument('path') +@click.option('--product', help='Which ASTER product? vnir, swir, or tir') +@click.pass_obj +def show(index, path, product): + file_paths = find_lpdaac_file_paths(Path(path)) + + for file_path in file_paths: + print(file_path) + bands = selected_bands(file_path, product) + + if bands: + doc = generate_lpdaac_doc(file_path, product) + print_dict(doc) + else: + logging.error("File does not have bands of this product: %s", file_path) + + +@cli.command() +@click.argument('path') +@click.option('--product', help='Which ASTER product? vnir, swir, or tir') +@click.pass_obj +def create_product(index, path, product): + file_paths = find_lpdaac_file_paths(Path(path)) + + # Find a file which has the specified bands of this product + file_path = None + for file_path_ in file_paths: + try: + _ = selected_bands(file_path_, product) + except AssertionError: + pass + else: + file_path = file_path_ + break + + if file_path: + measurements = raster_to_measurements(file_path, product) + for measure in measurements: + measure.pop('path') # This is not needed here + print_dict(measurements) + product_def = generate_lpdaac_defn(measurements, product) + print_dict(product_def) + + print(index) + product = index.products.from_doc(product_def) + print(product) + indexed_product = index.products.add(product) + print(indexed_product) + else: + logging.error("No file found having the specified bands of this product: %s", product) + + +@cli.command() +@click.argument('path') +@click.option('--product', help='Which ASTER product? 
vnir, swir, or tir') +@click.pass_obj +def index_data(index, path, product): + file_paths = find_lpdaac_file_paths(Path(path)) + + resolver = Doc2Dataset(index) + for file_path in file_paths: + print(file_path) + + bands = selected_bands(file_path, product) + if bands: + vrt_path = vrt_file_path(file_path, product) + + if vrt_path.exists(): + + doc = generate_lpdaac_doc(file_path, product) + print_dict(doc) + dataset, err = resolver(doc, vrt_path.as_uri()) + + print(dataset) + if err is not None: + logging.error("%s", err) + try: + index.datasets.add(dataset) + except Exception as e: + logging.error("Couldn't index %s", file_path) + logging.exception("Exception", e) + else: + with open(yaml_file_path(file_path, product), 'w') as yaml_file: + yaml.safe_dump(doc, yaml_file) + + else: + logging.error("VRT file not found: %s", vrt_path) + else: + logging.error("File does not have bands of this product: %s", file_path) + + +def vrt_file_path(file_path, product): + return file_path.with_name(f'{file_path.stem}_{product}.vrt') + + +def yaml_file_path(file_path, product): + return file_path.with_name(f'{file_path.stem}_{product}.yaml') + + +def print_dict(doc): + print(json.dumps(doc, indent=4, sort_keys=True, cls=NumpySafeEncoder)) + + +def find_lpdaac_file_paths(path: Path): + """ + Return a list of hdf file path objects. + + :param path: + :return: A generator of path objects. 
+ """ + for afile in path.iterdir(): + if afile.suffix == '.hdf' and afile.stem[:7] == 'AST_L1T': + yield afile + + +def raster_to_measurements(file_path, product): + bands = selected_bands(file_path, product) + + measurements = [] + for index, band in enumerate(bands): + measure = dict(name=str(index + 1)) + measure['path'] = vrt_file_path(file_path, product).name + + with rasterio.open(band) as band_: + measure['dtype'] = str(band_.dtypes[0]) + measure['nodata'] = band_.nodatavals[0] or 0 + measure['units'] = str(band_.units[0] or 1) + measure['aliases'] = [band.split(':')[-1]] + measurements.append(measure) + return measurements + + +@functools.lru_cache(maxsize=None) +def selected_bands(file_path, product): + band_suffixes = PRODUCTS[product]['bands'] + + ds = gdal.Open(str(file_path), gdal.GA_ReadOnly) + sub_datasets = ds.GetSubDatasets() + # Check the last field of the band name: something like 'ImageDataXX' + + available_bands_of_product = [band[0] for band in sub_datasets if band[0].split(':')[-1] in band_suffixes] + + assert len(available_bands_of_product) == len(band_suffixes) + + available_bands_of_product.sort(key=lambda x: x.split(':')[-1]) + + return available_bands_of_product + + +def generate_lpdaac_defn(measurements, product): + """ + Generate the product def for the product. 
+ """ + return { + 'name': product, + 'metadata_type': 'eo', + 'metadata': { + 'product_type': product, + 'platform': {'code': 'ASTER'}, + 'version': 1, + 'coverage': 'aust' + }, + 'description': 'ASTER L1T - Precision Terrain Corrected Registered At-Sensor Radiance data', + 'measurements': measurements + } + + +def generate_lpdaac_doc(file_path, product): + modification_time = file_path.stat().st_mtime + + unique_ds_uri = f'{file_path.as_uri()}#{modification_time}#{product}' + + left, bottom, right, top = compute_extents(file_path) + spatial_ref = infer_aster_srs(file_path) + geo_ref_points = { + 'ul': {'x': left, 'y': top}, + 'ur': {'x': right, 'y': top}, + 'll': {'x': left, 'y': bottom}, + 'lr': {'x': right, 'y': bottom}, + } + + acquisition_time = get_acquisition_time(file_path) + measurements = raster_to_measurements(file_path, product) + + the_format = 'VRT' + + doc = { + 'id': str(uuid.uuid5(uuid.NAMESPACE_URL, unique_ds_uri)), + 'product_type': product, + 'creation_dt': str(datetime.fromtimestamp(modification_time)), + 'platform': {'code': 'ASTER'}, + 'extent': { + 'from_dt': str(acquisition_time), + 'to_dt': str(acquisition_time), + 'coord': geo_ref_points + }, + 'format': {'name': the_format}, + 'grid_spatial': { + 'projection': { + 'geo_ref_points': geo_ref_points, + 'spatial_reference': spatial_ref, + } + }, + 'image': { + 'bands': { + measure['name']: { + 'path': measure['path'], + 'layer': str(index + 1) + } for index, measure in enumerate(measurements) + } + }, + 'version': 1, + 'coverage': 'aust', + 'lineage': {'source_datasets': {}}, + 'further_info': filter_metadata(file_path, product) + } + return doc + + +def infer_aster_srs(file_path: Path): + """ + Compute SRS based on metadata (UTMZONENUMBER and NORTHBOUNDINGCOORDINATE) in the file and + generic osr.SpatialReference data. 
+ Reference: + https://git.earthdata.nasa.gov/projects/LPDUR/repos/aster-l1t/raw/ASTERL1T_hdf2tif.py?at=refs%2Fheads%2Fmaster + """ + + ds = gdal.Open(str(file_path), gdal.GA_ReadOnly) + meta = ds.GetMetadata() + + # Define UL, LR, UTM zone + utm = np.int(meta['UTMZONENUMBER']) + n_s = np.float(meta['NORTHBOUNDINGCOORDINATE']) + + # Create UTM zone code numbers + utm_n = [i + 32600 for i in range(60)] + utm_s = [i + 32700 for i in range(60)] + + # Define UTM zone based on North or South + if n_s < 0: + utm_zone = utm_s[utm] + else: + utm_zone = utm_n[utm] + + srs = osr.SpatialReference() + srs.ImportFromEPSG(utm_zone) + + return srs.ExportToWkt() + + +def generate_vrt(file_path: Path, bands): + """ + Generate a VRT file for a given file + The following tags did not show visual impact on raster bands when rendering: + 1. Top level GeoTransform + """ + + assert bands + + x_size, y_size = get_raster_sizes(bands) + geo_transform = compute_geo_transform(file_path, bands) + + return """\ + + {srs} + {geo} + {raster_bands} + + """.format(x=x_size, y=y_size, srs=infer_aster_srs(file_path), + geo=', '.join(('%1.16e' % v for v in geo_transform)), + raster_bands=get_raster_bands_vrt(bands)) + + +def get_raster_bands_vrt(bands): + """ + Compute the tags for each band ang return them as a single string + + :param bands: GDAL SubDatasets + """ + + raster_band_template = """\ + + {nodata} + + {band_name} + + + """ + + raster_bands = '' + for index, band in enumerate(bands): + sdt = gdal.Open(band, gdal.GA_ReadOnly) + data_type = gdal.GetDataTypeName(sdt.GetRasterBand(1).DataType) + raster_bands += raster_band_template.format(dtype=data_type, number=str(index + 1), + nodata=0, + band_name=band) + return raster_bands + + +def get_raster_sizes(bands): + """ + Raster sizes of different bands are different. 
So compute the max of x axis + and max of y axis + + :param bands: GDAL SubDataset names + """ + + x_size = [] + y_size = [] + for band in bands: + sdt = gdal.Open(band, gdal.GA_ReadOnly) + x_size.append(sdt.RasterXSize) + y_size.append(sdt.RasterYSize) + return max(x_size), max(y_size) + + +def get_acquisition_time(file_path): + dt = gdal.Open(str(file_path), gdal.GA_ReadOnly) + meta = dt.GetMetadata() + date_string = meta['CALENDARDATE'] + + time_ = meta['TIMEOFDAY'] + + return datetime(year=int(date_string[:4]), month=int(date_string[4:6]), day=int(date_string[6:8]), + hour=int(time_[:2]), minute=int(time_[2:4]), second=int(time_[4:6]), + microsecond=int(time_[6:12]), tzinfo=timezone.utc) + + +def compute_geo_transform(file_path, bands): + """ + Compute the geo transform for the given bands. If the geo transform is not same + for all the given bands an assert error is raised. + """ + + # pylint: disable=round-builtin + + dt = gdal.Open(str(file_path), gdal.GA_ReadOnly) + meta = dt.GetMetadata() + + # Define UL, LR, UTM zone + ul = [np.float(x) for x in meta['UPPERLEFTM'].split(', ')] + lr = [np.float(x) for x in meta['LOWERRIGHTM'].split(', ')] + n_s = np.float(meta['NORTHBOUNDINGCOORDINATE']) + + # Define extent and provide offset for UTM South zones + if n_s < 0: + ul_y = ul[0] + 10000000 + ul_x = ul[1] + + lr_y = lr[0] + 10000000 + lr_x = lr[1] + + # Define extent for UTM North zones + else: + ul_y = ul[0] + ul_x = ul[1] + + lr_y = lr[0] + lr_x = lr[1] + + # We want all the bands to be consistent in terms of data type, + # raster number of columns and rows + band_info = dict() + band_info['ncol'] = set() + band_info['nrow'] = set() + band_info['data_type'] = set() + for band in bands: + band_ds = gdal.Open(band, gdal.GA_ReadOnly) + data_type = gdal.GetDataTypeName(band_ds.GetRasterBand(1).DataType) + if data_type == 'Byte': + band_data = band_ds.ReadAsArray().astype(np.byte) + elif data_type == 'UInt16': + band_data = band_ds.ReadAsArray().astype(np.uint16) + 
else: + raise ValueError('Unexpected band type') + + # Query raster dimensions + ncol, nrow = band_data.shape + + band_info['data_type'].add(data_type) + band_info['ncol'].add(ncol) + band_info['nrow'].add(nrow) + + assert len(band_info['data_type']) == 1 and len(band_info['ncol']) == 1 and len(band_info['nrow']) == 1 + + # Compute resolutions + y_res = -1 * round((max(ul_y, lr_y) - min(ul_y, lr_y)) / band_info['ncol'].pop()) + x_res = round((max(ul_x, lr_x) - min(ul_x, lr_x)) / band_info['nrow'].pop()) + + # Define UL x and y coordinates based on spatial resolution + ul_yy = ul_y - (y_res / 2) + ul_xx = ul_x - (x_res / 2) + + return ul_xx, x_res, 0., ul_yy, 0., y_res + + +def compute_extents(file_path): + """ + Compute the union of extents of individual raster bands. + https://git.earthdata.nasa.gov/projects/LPDUR/repos/aster-l1t/raw/ASTERL1T_hdf2tif.py?at=refs%2Fheads%2Fmaster + """ + dt = gdal.Open(str(file_path), gdal.GA_ReadOnly) + meta = dt.GetMetadata() + + # Define LL, UR + ll = [np.float(x) for x in meta['LOWERLEFTM'].split(', ')] + ur = [np.float(x) for x in meta['UPPERRIGHTM'].split(', ')] + n_s = np.float(meta['NORTHBOUNDINGCOORDINATE']) + # Define extent and provide offset for UTM South zones + if n_s < 0: + ll_y = ll[0] + 10000000 + ll_x = ll[1] + + ur_y = ur[0] + 10000000 + ur_x = ur[1] + + # Define extent for UTM North zones + else: + ll_y = ll[0] + ll_x = ll[1] + + ur_y = ur[0] + ur_x = ur[1] + + # Do we need to offset pixel center by half of pixel resolution as in the above reference? 
+ # Note: pixel resolution vary per band + + return ll_x, ll_y, ur_x, ur_y + + +def filter_metadata(file_path, product): + """ + Filter the metadata dictionary based on what is to include or exclude defined by + the global PRODUCTS + """ + + dt = gdal.Open(str(file_path), gdal.GA_ReadOnly) + meta = dt.GetMetadata() + items = set() + if PRODUCTS[product].get('include_only'): + for prefix in PRODUCTS[product]['include_only']: + items.update({meta_item for meta_item in meta if meta_item.startswith(prefix)}) + elif PRODUCTS[product].get('exclude'): + for prefix in PRODUCTS[product]['exclude']: + items.update({meta_item for meta_item in meta}) + items.difference({meta_item for meta_item in meta if meta_item.startswith(prefix)}) + return {item: meta[item] for item in items} + + +class NumpySafeEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NumpySafeEncoder, self).default(obj) + + +if __name__ == '__main__': + cli() diff --git a/scripts/index_aster_lpdaac.sh b/scripts/index_aster_lpdaac.sh new file mode 100755 index 00000000..87cb3f77 --- /dev/null +++ b/scripts/index_aster_lpdaac.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# The argument is the config file location +# e.g. 
+# ./index_aster_lpdaac.sh ../../../dsg547_dev.conf
+#./index_aster_lpdaac.py --config $1 index-data --product aster_l1t_vnir /g/data/v10/ASTER_AU/2018.01.01
+
+# Remember to do create product;
+# python ./index_aster_lpdaac.py
+# --config /g/data/u46/users/dsg547/dsg547_dev.conf
+# create-product --product aster_l1t_vnir
+# /g/data/v10/ASTER_AU/2018.01.01
+
+module use /g/data/v10/public/modules/modulefiles
+module load dea
+
+for i in /g/data/u46/users/aj9439/aster/tests/*; do
+    ./index_aster_lpdaac.py --config "$1" create-vrt --product aster_l1t_swir "$i"
+    ./index_aster_lpdaac.py --config "$1" index-data --product aster_l1t_swir "$i"
+done
\ No newline at end of file
diff --git a/scripts_tests/data/aster/2007.12.27/shrunk.hdf.xz b/scripts_tests/data/aster/2007.12.27/shrunk.hdf.xz
new file mode 100644
index 00000000..915c5e9b
Binary files /dev/null and b/scripts_tests/data/aster/2007.12.27/shrunk.hdf.xz differ
diff --git a/scripts_tests/data/aster/2017.12.10/shrunk.hdf.xz b/scripts_tests/data/aster/2017.12.10/shrunk.hdf.xz
new file mode 100644
index 00000000..c198a216
Binary files /dev/null and b/scripts_tests/data/aster/2017.12.10/shrunk.hdf.xz differ
diff --git a/scripts_tests/data/aster/hdf_shrinker.py b/scripts_tests/data/aster/hdf_shrinker.py
new file mode 100644
index 00000000..4a300e42
--- /dev/null
+++ b/scripts_tests/data/aster/hdf_shrinker.py
@@ -0,0 +1,51 @@
+# Thanks http://fhs.github.io/pyhdf/modules/SD.html#programming-models
+import lzma
+import shutil
+from pathlib import Path
+
+import click
+import numpy as np
+from pyhdf.SD import SD, SDC
+
+
+class PathlibPath(click.Path):
+    """A Click path argument that returns a pathlib Path, not a string"""
+
+    def convert(self, value, param, ctx):
+        return Path(super().convert(value, param, ctx))
+
+
+@click.command()
+@click.argument('in_file_name', type=click.Path(exists=True, dir_okay=False, readable=True))
+@click.argument('out_file_name', type=click.Path(exists=False, writable=True))
+def 
shrink_inplace(in_file_name, out_file_name): + """ + Shrink a HDF4 File + + Replace all data values with 0, and then compress with LZMA. + + Useful for generating tiny test data files that still contain all the metadata. + """ + shutil.copy(in_file_name, out_file_name) + + in_file = SD(out_file_name, SDC.WRITE) + + for dataset_name, dataset_def in in_file.datasets().items(): + coord_axis, shape, dataset_type, index = dataset_def + print(dataset_name, dataset_def) + + dataset = in_file.select(dataset_name) + + print(dataset[:]) + dataset[:] = np.zeros(shape, dataset.get().dtype) + + dataset.endaccess() + + in_file.end() + + with open(out_file_name, 'rb') as fin, lzma.open(out_file_name + '.xz', 'wb') as fout: + fout.write(fin.read()) + + +if __name__ == '__main__': + shrink_inplace() diff --git a/scripts_tests/data/aster/vrt_schema.xsd b/scripts_tests/data/aster/vrt_schema.xsd new file mode 100644 index 00000000..2000996e --- /dev/null +++ b/scripts_tests/data/aster/vrt_schema.xsd @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/scripts_tests/test_aster.py b/scripts_tests/test_aster.py new file mode 100644 index 00000000..7f655b04 --- /dev/null +++ b/scripts_tests/test_aster.py @@ -0,0 +1,134 @@ +import lzma +from pathlib import Path + +import pytest + +from datacube.index.hl import Doc2Dataset +from scripts.index_aster_lpdaac import generate_lpdaac_defn, generate_lpdaac_doc, generate_vrt +from scripts.index_aster_lpdaac import selected_bands, raster_to_measurements, vrt_file_path +from scripts.index_aster_lpdaac import PRODUCTS + +pytest_plugins = "digitalearthau.testing.plugin" + +SCRIPTS_TEST_DATA = Path(__file__).parent / 'data' + + +def uncompress_xz(in_file, dest_file): + """ + Uncompress the lzma compressed file + """ + with lzma.open(in_file, 'rb') as fin, open(dest_file, 'wb') as fout: + fout.write(fin.read()) + + +@pytest.fixture +def aster_files(tmp_path): + """ 
+ Return the locations of aster HDF files + """ + dest_file1 = tmp_path / 'AST_L1T_00312102017022934_20171211115854_25347.hdf' + uncompress_xz(SCRIPTS_TEST_DATA / 'aster' / '2017.12.10' / 'shrunk.hdf.xz', + dest_file1) + + dest_file2 = tmp_path / 'AST_L1T_00312272007012132_20150522121457_113468.hdf' + uncompress_xz(SCRIPTS_TEST_DATA / 'aster' / '2007.12.27' / 'shrunk.hdf.xz', + dest_file2) + + return dest_file1, dest_file2 + + +def products_present(aster_file_path): + """ + Return products present in a given aster file + """ + products = [] + for product in PRODUCTS: + try: + # See if band extraction fails for this product + _ = selected_bands(aster_file_path, product) + + products.append(product) + except AssertionError: + pass + return products + + +def test_product_defs(aster_files): + """ + Test product definitions generated for given files + """ + for aster_file in aster_files: + for product in products_present(aster_file): + measurements = raster_to_measurements(aster_file, product) + for measure in measurements: + measure.pop('path') # This is not needed here + product_def = generate_lpdaac_defn(measurements, product) + + assert product_def['metadata']['product_type'] == product + # Check all expected band names ['1', '2', '3', etc] + num_bands = len(PRODUCTS[product]['bands']) + assert all([a == b for a, b in zip([str(band_num) + for band_num in range(1, num_bands + 1)], + [m['name'] for m in product_def['measurements']])]) + + +def test_vrt_generation(aster_files): + """ + Test generated VRT strings for given files + """ + import xml.etree.ElementTree as ET + import xmlschema + + for aster_file in aster_files: + for product in products_present(aster_file): + bands = selected_bands(aster_file, product) + vrt = generate_vrt(aster_file, bands) + + # Is it valid VRT schema + xsd = xmlschema.XMLSchema(str(SCRIPTS_TEST_DATA / 'aster/vrt_schema.xsd')) + xsd.validate(vrt) + + tree = ET.fromstring(vrt) + + assert len(tree.findall('VRTRasterBand')) == 
len(PRODUCTS[product]['bands']) + sources = tree.findall('SourceFilename') + for source in sources: + parts = source.text.split(':') + # We want the source path name to be absolute + assert aster_file == Path(parts[2]) + assert parts[4] in PRODUCTS[product]['bands'] + + +def test_dataset_doc(aster_files): + """ + Test dataset docs generated for given files. + """ + for aster_file in aster_files: + for product in products_present(aster_file): + doc = generate_lpdaac_doc(aster_file, product) + assert doc['grid_spatial']['projection']['spatial_reference'] + assert len(doc['image']['bands']) == len(PRODUCTS[product]['bands']) + + +def test_dataset_indexing(dea_index, aster_files): + """ + Test datacube indexing for each product for the given files + """ + + for aster_file in aster_files: + for product in products_present(aster_file): + vrt_path = vrt_file_path(aster_file, product) + measurements = raster_to_measurements(aster_file, product) + for measure in measurements: + measure.pop('path') # This is not needed here + product_def = generate_lpdaac_defn(measurements, product) + product_ = dea_index.products.from_doc(product_def) + indexed_product = dea_index.products.add(product_) + + assert indexed_product + + doc = generate_lpdaac_doc(aster_file, product) + resolver = Doc2Dataset(dea_index) + dataset, _ = resolver(doc, vrt_path.as_uri()) + print('the dataset to be indexed: ', dataset) + dea_index.datasets.add(dataset) diff --git a/scripts/test_index_nci_modis_lpdaac.py b/scripts_tests/test_index_nci_modis_lpdaac.py similarity index 93% rename from scripts/test_index_nci_modis_lpdaac.py rename to scripts_tests/test_index_nci_modis_lpdaac.py index 9a031a83..33698d21 100644 --- a/scripts/test_index_nci_modis_lpdaac.py +++ b/scripts_tests/test_index_nci_modis_lpdaac.py @@ -1,7 +1,7 @@ from pathlib import Path from datetime import datetime -from index_nci_modis_lpdaac import * +from scripts.index_nci_modis_lpdaac import * def test_modis_path_to_date_range(): diff --git 
a/setup.py b/setup.py index 7c9a66ce..9c5b1612 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ license='Apache License 2.0', packages=find_packages( - exclude=('tests', 'tests.*', + exclude=('scripts_tests', 'scripts_tests.*', 'integration_tests', 'integration_tests.*') ), package_data={ @@ -37,7 +37,7 @@ 'pytest-runner' ], install_requires=[ - 'colorama', + 'colorama', # required to work around a structlog issue 'click>=5.0', 'datacube[celery]', 'python-dateutil', @@ -49,7 +49,6 @@ 'boltons', 'lxml', 'pydash', - 'colorama', # required to work around a structlog issue ], tests_require=tests_require, extras_require=extras_require, @@ -68,6 +67,8 @@ 'dea-stacker = digitalearthau.stacker:cli', 'dea-system = digitalearthau.system:cli', 'dea-test-env = digitalearthau.test_env:cli', - ] + ], + # This might be a bad idea, it pulls in all of datacube whenever pytest is run + #'pytest11': ['digitalearthau = digitalearthau.testing.plugin'] }, )