diff --git a/django_project/catalogue/ingestors/landsat.py b/django_project/catalogue/ingestors/landsat.py index c50e7e88..f7d1c74e 100644 --- a/django_project/catalogue/ingestors/landsat.py +++ b/django_project/catalogue/ingestors/landsat.py @@ -7,7 +7,6 @@ import os import sys import glob -from cmath import log from datetime import datetime from xml.dom.minidom import parse import traceback @@ -181,6 +180,17 @@ def get_spatial_resolution_y(filename): return 30 +def get_mission_value(filename): + if 'LO7' in filename \ + or 'LC7' in filename \ + or 'LT07' in filename: + return 'L7' + elif 'LO8' in filename \ + or 'LC8' in filename \ + or 'LT08' in filename: + return 'L8' + + def get_product_profile(log_message, dom): """Find the product_profile for this record. @@ -315,8 +325,8 @@ def get_projection(dom): In Landsat we only get 'UTM' for the CRS which is basically unusable for us (since we need the zone too) so we will always fail and return EPSG:4326 - :param specific_parameters: Dom Document containing the bounds of the scene. - :type specific_parameters: DOM document. + :param dom: Dom Document containing the bounds of the scene. + :type dom: DOM document. :returns: A projection model for the specified EPSG. :rtype: Projection @@ -348,6 +358,7 @@ def ingest( 'landsat/'), verbosity_level=2, halt_on_error_flag=True, + use_txt_flag=True, ignore_missing_thumbs=False): """ Ingest a collection of Landsat metadata folders. @@ -367,6 +378,9 @@ def ingest( error is encountered. Default is True. :type halt_on_error_flag: bool + :param use_txt_flag: Whether to read a txt or xml file. Default False. + :type use_txt_flag: bool + :param ignore_missing_thumbs: Whether we should raise an error if we find we are missing a thumbnails. Default is False. :type ignore_missing_thumbs: bool @@ -393,175 +407,250 @@ def log_message(log_message_content, level=1): # Scan the source folder and look for any sub-folders # The sub-folder names should be e.g. # L5-_TM-_HRF_SAM-_0176_00_0078_00_920606_080254_L0Ra_UTM34S - log_message('Scanning folders in %s' % source_path, 1) + log_message('Scanning folders in %s' % source_path, 2) # Loop through each folder found - ingestor_version = 'Landsat7/8 ingestor version 1.1' + ingestor_version = 'Landsat7/8 ingestor version 1.2' record_count = 0 updated_record_count = 0 created_record_count = 0 failed_record_count = 0 - log_message('Starting directory scan...', 2) - - for myFolder in glob.glob(os.path.join(source_path, '*')): - record_count += 1 - try: - log_message('', 2) - log_message('---------------', 2) - # Get the folder name - product_folder = os.path.split(myFolder)[-1] - log_message(product_folder, 2) - - # Find the first and only xml file in the folder - search_path = os.path.join(str(myFolder), '*.xml') - log_message(search_path, 2) - xml_file = glob.glob(search_path)[0] - log_message(xml_file, 2) - file_path = os.path.basename(xml_file) - filename = os.path.splitext(file_path)[0] - - # Create a DOM document from the file - dom = parse(xml_file) - # - # First grab all the generic properties that any - # Landsat will have... - geometry = get_geometry(dom) - start_date_time, center_date_time = get_dates( - log_message, dom) - # projection for GenericProduct - projection = get_projection(dom) - original_product_id = get_original_product_id(dom, filename) - # Band count for GenericImageryProduct - band_count = get_band_count() - # orbit_number = get_orbit_number(dom) -- NOT SPECIFIED IN METADATA - # # Spatial resolution x for GenericImageryProduct - spatial_resolution_x = float(get_spatial_resolution_x(filename)) - # # Spatial resolution y for GenericImageryProduct - spatial_resolution_y = float( - get_spatial_resolution_y(filename)) - log_message('Spatial resolution y: %s' % spatial_resolution_y, 2) - # Spatial resolution for GenericImageryProduct calculated as (x+y)/2 - spatial_resolution = ( - spatial_resolution_x + spatial_resolution_y) / 2 - log_message('Spatial resolution: %s' % spatial_resolution, 2) - radiometric_resolution = get_radiometric_resolution(dom) - log_message( - 'Radiometric resolution: %s' % radiometric_resolution, 2) - quality = get_quality() - # ProductProfile for OpticalProduct - product_profile = get_product_profile(log_message, dom) - # Get the original text file metadata - metadata_file = file(xml_file, 'rt') - metadata = metadata_file.readlines() - metadata_file.close() - log_message('Metadata retrieved', 2) - - unique_product_id = original_product_id - # Check if there is already a matching product based - # on original_product_id - - # Other data - cloud_cover = get_cloud_cover(dom) - solar_zenith_angle = get_solar_zenith_angle(dom) - solar_azimuth_angle = get_solar_azimuth_angle(dom) - - # Do the ingestion here... - data = { - 'metadata': metadata, - 'spatial_coverage': geometry, - 'radiometric_resolution': radiometric_resolution, - 'band_count': band_count, - 'original_product_id': original_product_id, - 'unique_product_id': unique_product_id, - 'spatial_resolution_x': spatial_resolution_x, - 'spatial_resolution_y': spatial_resolution_y, - 'spatial_resolution': spatial_resolution, - 'product_profile': product_profile, - 'product_acquisition_start': start_date_time, - 'product_date': center_date_time, - # 'orbit_number': orbit_number, # Not in current metadata - 'cloud_cover': cloud_cover, - 'projection': projection, - 'quality': quality, - 'solar_zenith_angle': solar_zenith_angle, - 'solar_azimuth_angle': solar_azimuth_angle - } - log_message(data, 3) - # Check if it's already in catalogue: - try: - today = datetime.today() - time_stamp = today.strftime("%Y-%m-%d") - log_message('Time Stamp: %s' % time_stamp, 2) - except Exception, e: - print e.message + log_message('Starting directory scan...', 1) + + # loop on subfolders + for dirName, subdirlist, fileList in os.walk(source_path): + + # Check flag: find xml or txt + full_path = os.path.join(source_path, dirName) + if use_txt_flag: + search_path = os.path.join(full_path, '*.txt') + else: + search_path = os.path.join(full_path, '*.xml') + log_message('search path: %s ' % search_path, 2) + + # for each sub folder, search metadata and thumbnail + for myFolder in glob.glob(search_path): + record_count += 1 - update_mode = True try: - log_message('Trying to update') - # original_product_id is not necessarily unique - # so we use product_id - product = OpticalProduct.objects.get( - original_product_id=original_product_id - ).getConcreteInstance() - log_message(('Already in catalogue: updating %s.' - % original_product_id), 2) - new_record_flag = False - message = product.ingestion_log - message += '\n' - message += '%s : %s - updating record' % ( - time_stamp, ingestor_version) - data['ingestion_log'] = message - product.__dict__.update(data) - except ObjectDoesNotExist: - log_message('Not in catalogue: creating.', 2) - update_mode = False - message = '%s : %s - creating record' % ( - time_stamp, ingestor_version) - data['ingestion_log'] = message + log_message('', 2) + log_message('---------------', 1) + # Get the folder name + product_folder = os.path.split(myFolder)[-1] + log_message('product folder: %s ' % product_folder, 2) + + xml_file = glob.glob(myFolder)[0] + log_message('xml_file: %s ' % xml_file, 2) + + if use_txt_flag: + log_message('processing txt metadata', 2) + data = ingest_txt(xml_file) + original_product_id = data['original_product_id'] + else: + log_message('processing xml metadata', 2) + file_path = os.path.basename(xml_file) + filename = os.path.splitext(file_path)[0] + # Create a DOM document from the file + dom = parse(xml_file) + # + # First grab all the generic properties that any + # Landsat will have... + geometry = get_geometry(dom) + start_date_time, center_date_time = get_dates( + log_message, dom) + # projection for GenericProduct + projection = get_projection(dom) + original_product_id = get_original_product_id(dom, filename) + # Band count for GenericImageryProduct + band_count = get_band_count() + # orbit_number = get_orbit_number(dom) -- NOT SPECIFIED IN METADATA + # # Spatial resolution x for GenericImageryProduct + spatial_resolution_x = float(get_spatial_resolution_x(filename)) + # # Spatial resolution y for GenericImageryProduct + spatial_resolution_y = float( + get_spatial_resolution_y(filename)) + log_message('Spatial resolution y: %s' % spatial_resolution_y, 2) + # Spatial resolution for GenericImageryProduct calculated as (x+y)/2 + spatial_resolution = ( + spatial_resolution_x + spatial_resolution_y) / 2 + log_message('Spatial resolution: %s' % spatial_resolution, 2) + radiometric_resolution = get_radiometric_resolution(dom) + log_message( + 'Radiometric resolution: %s' % radiometric_resolution, 2) + quality = get_quality() + # ProductProfile for OpticalProduct + product_profile = get_product_profile(log_message, dom) + # Get the original text file metadata + metadata_file = file(xml_file, 'rt') + metadata = metadata_file.readlines() + metadata_file.close() + log_message('Metadata retrieved', 2) + + unique_product_id = original_product_id + # Check if there is already a matching product based + # on original_product_id + + # Other data + cloud_cover = get_cloud_cover(dom) + solar_zenith_angle = get_solar_zenith_angle(dom) + solar_azimuth_angle = get_solar_azimuth_angle(dom) + + # Do the ingestion here... + data = { + 'metadata': metadata, + 'spatial_coverage': geometry, + 'radiometric_resolution': radiometric_resolution, + 'band_count': band_count, + 'original_product_id': original_product_id, + 'unique_product_id': unique_product_id, + 'spatial_resolution_x': spatial_resolution_x, + 'spatial_resolution_y': spatial_resolution_y, + 'spatial_resolution': spatial_resolution, + 'product_profile': product_profile, + 'product_acquisition_start': start_date_time, + 'product_date': center_date_time, + # 'orbit_number': orbit_number, # Not in current metadata + 'cloud_cover': cloud_cover, + 'projection': projection, + 'quality': quality, + 'solar_zenith_angle': solar_zenith_angle, + 'solar_azimuth_angle': solar_azimuth_angle + } + + # Check if it's already in catalogue: try: - product = OpticalProduct(**data) - log_message('Product: %s' % product) + today = datetime.today() + time_stamp = today.strftime("%Y-%m-%d") + log_message('Time Stamp: %s' % time_stamp, 2) + except Exception, e: + print e.message + update_mode = True + try: + log_message('Trying to update') + # original_product_id is not necessarily unique + # so we use product_id + product = OpticalProduct.objects.get( + original_product_id=original_product_id + ).getConcreteInstance() + log_message(('Already in catalogue: updating %s.' + % original_product_id), 2) + new_record_flag = False + message = product.ingestion_log + message += '\n' + message += '%s : %s - updating record' % ( + time_stamp, ingestor_version) + data['ingestion_log'] = message + product.__dict__.update(data) + except ObjectDoesNotExist: + log_message('Not in catalogue: creating.', 2) + update_mode = False + message = '%s : %s - creating record' % ( + time_stamp, ingestor_version) + data['ingestion_log'] = message + try: + product = OpticalProduct(**data) + log_message('Product: %s' % product) + + except Exception, e: + log_message(e.message, 2) + + new_record_flag = True except Exception, e: - log_message(e.message, 2) + print e.message - new_record_flag = True - except Exception, e: - print e.message + log_message('Saving product and setting thumb', 2) + try: + product.save() + if update_mode: + updated_record_count += 1 + else: + created_record_count += 1 + if new_record_flag: + log_message('Product %s imported.' % record_count, 2) + pass + else: + log_message('Product %s updated.' % updated_record_count, 2) + pass + except Exception, e: + traceback.print_exc(file=sys.stdout) + raise CommandError('Cannot import: %s' % e) - log_message('Saving product and setting thumb', 2) - try: - product.save() - if update_mode: - updated_record_count += 1 + if test_only_flag: + log_message('Testing: image not saved.', 2) + pass else: - created_record_count += 1 + thumbs_folder = os.path.join( + settings.THUMBS_ROOT, + product.thumbnailDirectory()) + try: + os.makedirs(thumbs_folder) + except OSError: + # TODO: check for creation failure rather than + # attempt to recreate an existing dir + pass + + jpeg_path = os.path.join(str(xml_file)) + # reconstruct jpeg path and file name + jpeg_file = product_folder.split('_')[0] + "_THUMB.png" + jpeg_path = os.path.join(jpeg_path.rsplit('/', 1)[0], jpeg_file) + + log_message('jpeg_path: %s' % jpeg_path, 2) + + new_name = '%s.jpg' % product.original_product_id + + if jpeg_path: + try: + if use_txt_flag: + shutil.copyfile( + jpeg_path, + os.path.join(thumbs_folder, new_name)) + else: + shutil.copyfile( + os.path.join(jpeg_path, new_name), + os.path.join(thumbs_folder, new_name)) + print new_name + except IOError as e: + if ignore_missing_thumbs: + continue + else: + raise Exception('Missing thumbnail in %s' % jpeg_path) + # Transform and store .wld file + # log_message('Referencing thumb', 2) + # try: + # path = product.georeferencedThumbnail() + # log_message('Georeferenced Thumb: %s' % path, 2) + # except: + # traceback.print_exc(file=sys.stdout) + # elif ignore_missing_thumbs: + # log_message('IGNORING missing thumb:') + else: + raise Exception('Missing thumbnail in %s' % jpeg_path) + if new_record_flag: log_message('Product %s imported.' % record_count, 2) pass else: log_message('Product %s updated.' % updated_record_count, 2) pass - except Exception, e: - traceback.print_exc(file=sys.stdout) - raise CommandError('Cannot import: %s' % e) - - if test_only_flag: - transaction.rollback() - log_message('Imported scene : %s' % product_folder, 1) - log_message('Testing only: transaction rollback.', 1) - else: - transaction.commit() - log_message('Imported scene : %s' % product_folder, 1) - except Exception, e: - log_message('Record import failed. AAAAAAARGH! : %s' % - product_folder, 1) - failed_record_count += 1 - if halt_on_error_flag: - print e.message - break - else: - continue + + if test_only_flag: + transaction.rollback() + log_message('Imported scene : %s' % product_folder, 2) + log_message('Testing only: transaction rollback.', 2) + else: + transaction.commit() + log_message('Imported scene : %s' % product_folder, 2) + + except Exception as e: + log_message('Record import failed. AAAAAAARGH! : %s %s' % + (product_folder, e), 2) + failed_record_count += 1 + if halt_on_error_flag: + print e.message + break + else: + continue # To decide: should we remove ingested product folders? @@ -571,3 +660,279 @@ def log_message(log_message_content, level=1): print 'Products imported : %s ' % created_record_count print 'Products failed to import : %s ' % failed_record_count print '===============================' + + +list_keywords = ['CORNER_UL_LAT_PRODUCT', + 'CORNER_UL_LON_PRODUCT', + 'CORNER_UR_LAT_PRODUCT', + 'CORNER_UR_LON_PRODUCT', + 'CORNER_LL_LAT_PRODUCT', + 'CORNER_LL_LON_PRODUCT', + 'CORNER_LR_LAT_PRODUCT', + 'CORNER_LR_LON_PRODUCT', + 'FILE_DATE', + 'CLOUD_COVER', + 'SENSOR_ID', + 'UTM_ZONE', + 'LANDSAT_SCENE_ID' + ] + + +def get_cloud_cover_txt(list_values): + return float(list_values['CLOUD_COVER']) + + +def get_geometry_txt(list_values): + polygon = 'POLYGON ((' ' %s %s, %s %s, %s %s, %s %s, %s %s' '))' % ( + list_values['CORNER_UL_LON_PRODUCT'], list_values['CORNER_UL_LAT_PRODUCT'], + list_values['CORNER_UR_LON_PRODUCT'], list_values['CORNER_UR_LAT_PRODUCT'], + list_values['CORNER_LR_LON_PRODUCT'], list_values['CORNER_LR_LAT_PRODUCT'], + list_values['CORNER_LL_LON_PRODUCT'], list_values['CORNER_LL_LAT_PRODUCT'], + list_values['CORNER_UL_LON_PRODUCT'], list_values['CORNER_UL_LAT_PRODUCT'], + ) + polygon_geometry = WKTReader().read(polygon) + return polygon_geometry + + +def get_dates_txt(list_values): + start_date = list_values['FILE_DATE'] + + start_year = start_date[0:4] + start_month = start_date[5:7] + start_day = start_date[:8:10] + start_time = start_date[11:18] + tokens = start_time.split(':') + start_hour = tokens[0] + start_minute = tokens[1] + start_seconds = tokens[2] + + # print "%s-%s-%sT%s:%s:%s" % ( + # start_year, start_month, start_day, + # start_hour, start_minute, start_seconds) + parsed_date_time = datetime( + int(start_year), + int(start_month), + int(start_day), + int(start_hour), + int(start_minute), + int(start_seconds)) + return parsed_date_time + + +def get_original_product_id_txt(list_values): + return list_values['LANDSAT_SCENE_ID'] + + +def get_product_profile_txt(list_values, filename): + """Find the product_profile for this record. + + It can be that one or more spectral modes are associated with a product. + For example Landsat8 might have Pan (1 band), Multispectral (8 bands) and + Thermal (2 bands) modes associated with a single product (total 11 bands). + + Because of this there is a many to many relationship on + OpticalProductProfile and to get a specific OpticalProductProfile record + we would need to know the satellite instrument and all the associated + spectral modes to that profile record. + + We use the following elements to reverse engineer what the + OpticalProductProfile is:: + + HRF + OLI_TIRS + LANDSAT8 + + :param list_values: A list of values from txt file. + :type list_values: dict + + :param filename: A full path file name. + :type filename: String. + + :return: A product profile for the given product. + :rtype: OpticalProductProfile + """ + # We need type, sensor and mission so that we can look up the + # OpticalProductProfile that applies to this product + sensor_value = list_values['SENSOR_ID'].strip('"') + mission_value = get_mission_value(filename) + + try: + instrument_type = InstrumentType.objects.get( + operator_abbreviation=sensor_value) # e.g. OLI_TIRS + except Exception, e: + # print e.message + raise e + + satellite = Satellite.objects.get(abbreviation=mission_value) + + try: + satellite_instrument_group = SatelliteInstrumentGroup.objects.get( + satellite=satellite, instrument_type=instrument_type) + except Exception, e: + print e.message + raise e + + # Note that in some cases e.g. Landsat you may get more that one instrument + # groups matched. When the time comes you will need to add more filtering + # rules to ensure that you end up with only one instrument group. + # For the mean time, we can assume that Landsat will return only one. + + try: + satellite_instrument = SatelliteInstrument.objects.get( + satellite_instrument_group=satellite_instrument_group) + except Exception, e: + print e.message + raise e + + try: + spectral_modes = SpectralMode.objects.filter( + instrument_type=instrument_type) + except Exception, e: + print e.message + raise + + try: + product_profile = OpticalProductProfile.objects.get( + satellite_instrument=satellite_instrument, + spectral_mode__in=spectral_modes) + except Exception, e: + print e.message + print 'Searched for satellite instrument: %s and spectral modes %s' % ( + satellite_instrument, spectral_modes + ) + raise e + + return product_profile + + +def get_projection_txt(list_values): + """Get the projection for this product record. + + The project is always expressed as an EPSG code and we fetch the related + Projection model for that code. + + In Landsat we only get 'UTM' for the CRS which is basically unusable for + us (since we need the zone too) so we will always fail and return EPSG:4326 + + :param list_values: A list of values from txt file. + :type list_values: dict + + :returns: A projection model for the specified EPSG. + :rtype: Projection + """ + epsg_default_code = '32' + zone = list_values['UTM_ZONE'] + location_code = '7' # 6 for north and 7 for south + epsg_code = epsg_default_code + location_code + zone + + projection = Projection.objects.get(epsg_code=epsg_code) + return projection + + +def get_radiometric_resolution_txt(filename): + """Get the radiometric resolution for the supplied product record. + + :param filename: A full path file name. + :type filename: String. + + :returns: Either 8 or 16. + :rtype: integer. + """ + mission_index_value = filename[0:3] + if mission_index_value == 'LO7': + return 8 + elif mission_index_value == 'LO8': + return 16 + # detect LC + elif mission_index_value == 'LC7': + return 8 + elif mission_index_value == 'LC8': + return 16 + # detect LC ONLY IF IT HAS DIFFERENT FILE NAME + elif filename[0:4] == 'LC07': + return 8 + elif filename[0:4] == 'LC08': + return 16 + + +def ingest_txt(search_path): + """ + Ingest a collection of Landsat metadata folders using txt files. + + The entry point is from ingest function. + + :param search_path: A path to the Landsat folder. + :type search_path: String + + :returns: A dictionary of data to be populated to database. + :rtype: dict + """ + def log_message(log_message_content, level=1): + """Log a message for a given level. + + :param log_message_content: A message. + :param level: A log level. + """ + if level == 1: + print log_message_content + + try: + log_message('Processing txt files', 2) + file_path = os.path.basename(search_path) + filename = os.path.splitext(file_path)[0] + + list_values = {} + metadata = "" + # find the keywords and save it to the list + with open(search_path, 'r') as f: + for line in f: + loi = line.strip().split(' = ') + search_keyword = loi[0] + metadata += line + if search_keyword in list_keywords: + search_value = loi[1] + list_values[search_keyword] = search_value + + geometry = get_geometry_txt(list_values) + radiometric_resolution = get_radiometric_resolution_txt(filename) + # original_product_id = get_original_product_id_txt(filename) + original_product_id = get_original_product_id_txt(list_values) + unique_product_id = original_product_id + + cloud_cover = get_cloud_cover_txt(list_values) + projection = get_projection_txt(list_values) + quality = get_quality() + start_date_time = get_dates_txt(list_values) + center_date_time = get_dates_txt(list_values) + spatial_resolution_x = get_spatial_resolution_x(filename) + spatial_resolution_y = get_spatial_resolution_y(filename) + spatial_resolution = (spatial_resolution_x + spatial_resolution_y) / 2 + product_profile = get_product_profile_txt(list_values, filename) + + data = { + 'metadata': metadata, + 'spatial_coverage': geometry, + 'radiometric_resolution': radiometric_resolution, + # 'band_count': band_count, + 'band_count': 9, + 'original_product_id': original_product_id, + 'unique_product_id': unique_product_id, + 'spatial_resolution_x': spatial_resolution_x, + 'spatial_resolution_y': spatial_resolution_y, + 'spatial_resolution': spatial_resolution, + 'product_profile': product_profile, + 'product_acquisition_start': start_date_time, + 'product_date': center_date_time, + # 'orbit_number': orbit_number, # Not in current metadata + 'cloud_cover': cloud_cover, + 'projection': projection, + 'quality': quality, + 'solar_zenith_angle': 45.00, + 'solar_azimuth_angle': 45.00 + # 'solar_zenith_angle': solar_zenith_angle, + # 'solar_azimuth_angle': solar_azimuth_angle + } + log_message(data, 3) + except Exception as e: + log_message('Error on ingest_txt: %s' % e, 2) + return data diff --git a/django_project/catalogue/management/commands/landsat_harvest.py b/django_project/catalogue/management/commands/landsat_harvest.py index 6bae839b..fdb049d3 100644 --- a/django_project/catalogue/management/commands/landsat_harvest.py +++ b/django_project/catalogue/management/commands/landsat_harvest.py @@ -21,6 +21,13 @@ class Command(BaseCommand): action='store_true', help='Just test, nothing will be written into the DB.', default=False), + make_option( + '--use_txt', + '-u', + dest='use_txt_flag', + action='store_true', + help='Use text file to ingest.', + default=False), make_option( '--source_dir', '-d', @@ -31,7 +38,7 @@ class Command(BaseCommand): 'thumbnail to import.'), default=( '/home/web/django_project' - '/data/landsat/')), + '/data/Landsat/')), make_option( '--halt_on_error', '-e', dest='halt_on_error_flag', action='store', @@ -43,7 +50,7 @@ class Command(BaseCommand): '--ignore-missing-thumbs', '-i', dest='ignore_missing_thumbs_flag', - action='store', + action='store_true', help=( 'Continue with importing records even if they miss their' 'thumbnails.'), @@ -62,6 +69,7 @@ def _parameter_to_bool(parameter): def handle(self, *args, **options): """ command execution """ test_only = self._parameter_to_bool(options.get('test_only_flag')) + use_txt = options.get('use_txt_flag') source_dir = options.get('source_dir') verbose = int(options.get('verbosity')) halt_on_error = self._parameter_to_bool( @@ -72,6 +80,7 @@ def handle(self, *args, **options): landsat.ingest( source_path=source_dir, test_only_flag=test_only, + use_txt_flag=use_txt, verbosity_level=verbose, halt_on_error_flag=halt_on_error, ignore_missing_thumbs=ignore_missing_thumbs