diff --git a/adsmp/app.py b/adsmp/app.py index b507106..b94cbaf 100644 --- a/adsmp/app.py +++ b/adsmp/app.py @@ -21,10 +21,12 @@ from copy import deepcopy import sys from sqlalchemy.dialects.postgresql import insert +import re import csv from datetime import timedelta from SciXPipelineUtils import scix_id + class ADSMasterPipelineCelery(ADSCelery): def __init__(self, app_name, *args, **kwargs): @@ -843,38 +845,69 @@ def generate_boost_request_message(self, bibcode, run_id=None, output_path=None) def generate_links_for_resolver(self, record): """use nonbib or bib elements of database record and return links for resolver and checksum""" - # nonbib data has something like - # "data_links_rows": [{"url": ["http://arxiv.org/abs/1902.09522"] - # bib data has json string for: - # "links_data": [{"access": "open", "instances": "", "title": "", "type": "preprint", - - # "url": "http://arxiv.org/abs/1902.09522"}] + # Links data can come from two sources with different formats: + # + # 1. Nonbib data in either: + # - Old format with "data_links_rows": + # [{"url": ["http://arxiv.org/abs/1902.09522"], "title": [""], "item_count": 0}] + # - New format with "links": + # {"DATA": {"url": ["http://arxiv.org/abs/1902.09522"], "title": [""], "count": 0}} + # + # 2. Bib data with "links_data" as JSON string: + # [{"access": "open", + # "instances": "", + # "title": "", + # "type": "preprint", + # "url": "http://arxiv.org/abs/1902.09522"}] + # + # We prioritize nonbib data when available, falling back to bib data if needed. + # We also assume it will be in either the old or new format, but not both. + # If there is no relevant data, we return None. + resolver_record = None # default value to return bibcode = record.get('bibcode') nonbib = record.get('nonbib_data', {}) - if type(nonbib) is not dict: + + # Validate nonbib is a dict before accessing its methods + if not isinstance(nonbib, dict): nonbib = {} # in case database has None or something odd - nonbib_links = nonbib.get('data_links_rows', None) - if nonbib_links: - # when avilable, prefer link info from nonbib - resolver_record = {'bibcode': bibcode, - 'data_links_rows': nonbib_links} - else: - # as a fallback, use link from bib/direct ingest + + # New format + nonbib_new_links = nonbib.get('links', {}) + + resolver_record = {'bibcode': bibcode, + 'links': nonbib_new_links} + + # Old format links are in the "data_links_rows" key + nonbib_old_links = nonbib.get('data_links_rows', []) + + # If nonbib data but in old format, transform into new format + if not nonbib_new_links and nonbib_old_links: + # transform into new format + resolver_record['links'] = self._transform_old_links_to_new_format(nonbib_old_links) + + # If not nonbib data in any format, use link from bib/direct ingest + elif not nonbib_new_links: + self.logger.debug('No nonbib data in any format, using link from bib/direct ingest') + bib = record.get('bib_data', {}) - if type(bib) is not dict: - bib = {} + if not isinstance(bib, dict): + bib = {} + bib_links_record = bib.get('links_data', None) + + if bib_links_record: try: bib_links_data = json.loads(bib_links_record[0]) + self.logger.debug('Bib links data: {}'.format(bib_links_data)) + url = bib_links_data.get('url', None) if url: - # need to change what direct sends + # Need to change what direct sends url_pdf = url.replace('/abs/', '/pdf/') - resolver_record = {'bibcode': bibcode, - 'data_links_rows': [{'url': [url], + default_data_links_rows = {'data_links_rows': [{'url': [url], 'title': [''], 'item_count': 0, 'link_type': 'ESOURCE', 'link_sub_type': 'EPRINT_HTML'}, @@ -882,10 +915,310 @@ def generate_links_for_resolver(self, record): 'title': [''], 'item_count': 0, 'link_type': 'ESOURCE', 'link_sub_type': 'EPRINT_PDF'}]} - except (KeyError, ValueError): - # here if record holds unexpected value + self.logger.debug('Default data links rows: {}'.format(default_data_links_rows)) + # Transform into new format + resolver_record['links'] = self._transform_old_links_to_new_format(default_data_links_rows['data_links_rows']) + except (KeyError, ValueError, IndexError): + # here if record holds unexpected value or empty list self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record)) + return + + # Only populate links if it has relevant data, otherwise return None + if resolver_record.get('links', {}): + # Now populate any missing link information from all available sources + resolver_record = self._populate_links_structure(record, resolver_record) + return resolver_record + + + + def _transform_old_links_to_new_format(self, data_links_rows): + """Transform the old links format to the new links format""" + + self.logger.debug('Transforming old links to new format. Old format: {}'.format(data_links_rows)) + + link_type_mapping = { + 'DATA': 'DATA', + 'ESOURCE': 'ESOURCE', + 'ASSOCIATED': 'ASSOCIATED', + 'INSPIRE': 'INSPIRE', + 'LIBRARYCATALOG': 'LIBRARYCATALOG', + 'PRESENTATION': 'PRESENTATION' + } + + new_links = { + "ARXIV": [], + "DOI": [], + "DATA": {}, + "ESOURCE": {}, + "ASSOCIATED": { + "url": [], + "title": [], + "count": 0 + }, + "INSPIRE": { + "url": [], + "title": [], + "count": 0 + }, + "LIBRARYCATALOG": { + "url": [], + "title": [], + "count": 0 + }, + "PRESENTATION": { + "url": [], + "title": [], + "count": 0 + }, + "ABSTRACT": True, + "CITATIONS": False, + "GRAPHICS": True, + "METRICS": False, + "OPENURL": True, + "REFERENCES": False, + "TOC": False, + "COREAD": True + } + + for row in data_links_rows: + link_type = row.get('link_type', '') + + # TOC is a boolean flag, not a URL-based link type + if link_type == 'TOC': + new_links['TOC'] = True + continue + + if link_type not in link_type_mapping: + continue + + mapped_type = link_type_mapping[link_type] + + if mapped_type in ('DATA', 'ESOURCE'): + sub_type = row.get('link_sub_type', '') + if sub_type not in new_links[mapped_type]: + new_links[mapped_type][sub_type] = { + 'url': [], + 'title': [], + 'count': 0 + } + if 'url' in row: + new_links[mapped_type][sub_type]['url'].extend(row['url']) + if 'title' in row: + new_links[mapped_type][sub_type]['title'].extend(row['title']) + if 'item_count' in row: + new_links[mapped_type][sub_type]['count'] = row['item_count'] + + else: + if 'url' in row: + new_links[mapped_type]['url'].extend(row['url']) + if 'title' in row: + new_links[mapped_type]['title'].extend(row['title']) + if 'item_count' in row: + new_links[mapped_type]['count'] = row['item_count'] + + self.logger.debug('Transformed old links to new format: {}'.format(new_links)) + return new_links + + def is_arxiv_id(self, identifier): + + self.logger.debug('Checking if identifier is an arXiv ID: {}'.format(identifier)) + + identifier = str(identifier).lower() + + patterns = [ + r'^arxiv:\d{4}\.\d{4,5}(v\d+)?$', # new-style: arXiv:YYMM.NNNN(N)(vV) + r'^arxiv:[a-z\-]+(\.[a-z]{2})?/\d{7}(v\d+)?$', # old-style: archive(.SC)/YYMMNNN(vV) + r'^10\.48550/arxiv\.\d{4}\.\d{4,5}(v\d+)?$', # DOI new-style + r'^10\.48550/arxiv\.[a-z\-]+(\.[a-z]{2})?/\d{7}(v\d+)?$' # DOI old-style + ] + + result = any(re.match(pat, identifier) for pat in patterns) + self.logger.debug('Is arXiv ID: {}'.format(result)) + + return result + + + def is_doi_id(self, identifier): + self.logger.debug('Checking if identifier is a DOI ID: {}'.format(identifier)) + + + identifier = str(identifier).strip().lower() + + # Regex pattern for DOI + doi_pattern = re.compile(r'^10\.\d{4,9}/\S+$', re.IGNORECASE) + return bool(doi_pattern.match(identifier)) + + + def _extract_data_components(self, record): + """Extract and validate data components from a record. + + Args: + record (dict): The complete record with all available components + + Returns: + tuple: Contains (bibcode, bib_data, fulltext, metrics, nonbib, orcid_claims) + """ + bibcode = record.get('bibcode') + + # Extract and validate all needed components + bib_data = record.get('bib_data', {}) + if not isinstance(bib_data, dict): + bib_data = {} + + fulltext = record.get('fulltext', {}) + if not isinstance(fulltext, dict): + fulltext = {} + + metrics = record.get('metrics', {}) + if not isinstance(metrics, dict): + metrics = {} + + nonbib = record.get('nonbib_data', {}) + if not isinstance(nonbib, dict): + nonbib = {} + + orcid_claims = record.get('orcid_claims', {}) + if not isinstance(orcid_claims, dict): + orcid_claims = {} + + self.logger.debug('Extracted data components from record. Bibcode: {}, Bib data: {}, Fulltext: {}, Metrics: {}, Nonbib: {}, Orcid claims: {}'.format(bibcode, bib_data, fulltext, metrics, nonbib, orcid_claims)) + return bibcode, bib_data, fulltext, metrics, nonbib, orcid_claims + + def _populate_identifiers(self, bibcode, bib_data, resolver_record, links): + """Populate the identifier list and extract ARXIV and DOI links. + + Args: + record (dict): The complete record with all available components from the database + resolver_record (dict): The resolver record to update with identifiers + links (dict): The links structure to update with ARXIV and DOI information + + Returns: + dict: The updated links structure with ARXIV and DOI fields populated + """ + + # Collect all identifiers from all sources + identifiers = self._collect_identifiers(bibcode, bib_data) + resolver_record['identifier'] = identifiers + + # Process identifiers for ARXIV and DOI fields + arxiv_ids = set() + doi_ids = set() + + for identifier in identifiers: + + # ARXIV identifiers + if identifier and self.is_arxiv_id(identifier): + arxiv_ids.add(identifier) + + # DOI identifiers + if identifier and self.is_doi_id(identifier): + doi_ids.add(identifier) + + # Initialize ARXIV and DOI fields if they don't exist or are not lists + if 'ARXIV' not in links or not isinstance(links['ARXIV'], list): + links['ARXIV'] = [] + + if 'DOI' not in links or not isinstance(links['DOI'], list): + links['DOI'] = [] + + # Add identifiers to links structure + if arxiv_ids: + links['ARXIV'].extend(list(arxiv_ids)) + + if doi_ids: + links['DOI'].extend(list(doi_ids)) + + self.logger.debug('Populated identifiers for record: {}. ARXIV: {}, DOI: {}'.format(bibcode, arxiv_ids, doi_ids)) + return links + + def _populate_links_structure(self, record, resolver_record): + '''Finally populates missing data like identifier, arxiv, doi, abstract, graphics, metrics, citations and references. + + Args: + record (dict): The complete record with all available components from the database + resolver_record (dict): The partially populated resolver record with base links structure + + Returns: + dict: The fully populated resolver record with all available link information + ''' + self.logger.debug('Populating links structure for record: {}'.format(record)) + links = resolver_record.get('links', {}) + + # Extract all necessary components + bibcode, bib_data, _, metrics, nonbib, _ = self._extract_data_components(record) + + # Populate identifiers and extract ARXIV and DOI links + links = self._populate_identifiers(bibcode, bib_data, resolver_record, links) + + # Always set ABSTRACT to True + links['ABSTRACT'] = True + + # Set CITATIONS flag if citations are present in metrics or nonbib + if ('citation_num' in metrics and metrics['citation_num'] > 0) or \ + ('citation_count' in nonbib and nonbib['citation_count'] > 0): + links['CITATIONS'] = True + + # Always set GRAPHICS to True + links['GRAPHICS'] = True + + # Set METRICS flag if metrics data is available + if metrics and any(key in metrics for key in ['citation_num', 'downloads', 'reads']): + links['METRICS'] = True + + # Always set OPENURL to True + links['OPENURL'] = True + + # Set REFERENCES flag if references are present in nonbib or metrics indicates references + if ('reference' in nonbib and len(nonbib['reference']) > 0) or \ + (metrics and 'reference_num' in metrics and metrics['reference_num'] > 0): + links['REFERENCES'] = True + + # Check for TOC flag + if 'property' in nonbib and isinstance(nonbib['property'], list) and 'TOC' in nonbib['property']: + links['TOC'] = True + elif bib_data.get('metadata', {}).get('properties', {}).get('doctype', {}).get('content') == 'toc': + links['TOC'] = True + + # Always set COREAD to True + links['COREAD'] = True + + # Update the links in the resolver record + resolver_record['links'] = links + + self.logger.debug('Populated links structure for record: {}'.format(resolver_record)) return resolver_record + + def _collect_identifiers(self, bibcode, bib_data): + """Collect identifiers from all available sources. + + Args: + bibcode (str): The bibcode of the record + bib_data (dict): The bibliographic data dictionary + + Returns: + list: A list of all collected identifiers (deduplicated) + """ + identifiers = set() + + # 1. Add bibcode itself as an identifier + if bibcode: + identifiers.add(bibcode) + + # 2. Get identifiers from bib_data + if 'identifier' in bib_data: + bib_identifiers = bib_data.get('identifier', []) + if isinstance(bib_identifiers, list): + identifiers.update([id for id in bib_identifiers if id]) + + # 3. Get any additional identifiers from alternate_bibcode + if 'alternate_bibcode' in bib_data: + alt_bibcodes = bib_data.get('alternate_bibcode', []) + if isinstance(alt_bibcodes, list): + identifiers.update([id for id in alt_bibcodes if id]) + + self.logger.debug('Collected identifiers: {}'.format(identifiers)) + return list(identifiers) + def should_include_in_sitemap(self, record): """ @@ -1311,4 +1644,4 @@ def chunked(self, iterable, chunk_size): chunk = list(islice(iterator, chunk_size)) if not chunk: break - yield chunk \ No newline at end of file + yield chunk diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py index 808a961..b1418e7 100644 --- a/adsmp/solr_updater.py +++ b/adsmp/solr_updater.py @@ -446,62 +446,42 @@ def transform_json_record(db_record): timestamps = [] for k, v in DB_COLUMN_DESTINATIONS: ts = db_record.get(k + "_updated", None) - if ts: - ts = time.mktime(ts.timetuple()) - else: - ts = sys.maxsize # default to use option without timestamp + ts = time.mktime(ts.timetuple()) if ts else sys.maxsize # default to use option without timestamp timestamps.append((k, v, ts)) timestamps.sort(key=lambda x: x[2]) # merge data based on timestamps - for field, target, _ in timestamps: + for field, target, ts in timestamps: # fields = {bib_data, nonbib_data, orcid_claims, metrics ..} if db_record.get(field, None): - if target: + if not target: # bib_data + out.update(db_record.get(field)) + else: if callable(target): - x = target( + enriched_data = target( db_record.get(field), out ) # in the interest of speed, don't create copy of out - if x: - out.update(x) - else: - out[target] = db_record.get(field) - else: - if target is None: - continue - - out.update(db_record.get(field)) - - elif field.startswith("#"): + if enriched_data: + out.update(enriched_data) + else: # id + out[target] = db_record.get(field) + elif field.startswith("#"): # timestamps if callable(target): - x = target( + enriched_data = target( db_record, out ) # in the interest of speed, don't create copy of out - if x: - out.update(x) - - # override temporal priority for links data - if ( - db_record.get("bib_data", None) - and db_record.get("nonbib_data", None) - and db_record["bib_data"].get("links_data", None) - and db_record["nonbib_data"].get("links_data", None) + if enriched_data: + out.update(enriched_data) + + # If both bib and nonbib pipeline provided links data + # use nonbib data even if it is older + if all( + db_record.get(key, {}).get("links_data") + for key in ("bib_data", "nonbib_data") ): - # here if both bib and nonbib pipeline provided links data - # use nonbib data even if it is older + logger.debug('Both bib and nonbib data provided links data. Using nonbib data: {}'.format(db_record["nonbib_data"]["links_data"])) out["links_data"] = db_record["nonbib_data"]["links_data"] - - # override temporal priority for bibgroup and bibgroup_facet, prefer nonbib - if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get( - "bibgroup", None - ): - out["bibgroup"] = db_record["nonbib_data"]["bibgroup"] - if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get( - "bibgroup_facet", None - ): - out["bibgroup_facet"] = db_record["nonbib_data"]["bibgroup_facet"] - - # if only bib data is available, use it to compute property - if db_record.get("nonbib_data", None) is None and db_record.get("bib_data", None): + elif db_record.get("bib_data", {}).get("links_data"): + logger.debug('Only bib data provided links data. Using bib data: {}'.format(db_record["bib_data"]["links_data"])) links_data = db_record["bib_data"].get("links_data", None) if links_data: try: @@ -528,6 +508,18 @@ def transform_json_record(db_record): db_record["bibcode"], type(links_data), links_data ) ) + + # override temporal priority for bibgroup and bibgroup_facet, prefer nonbib + if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get( + "bibgroup", None + ): + out["bibgroup"] = db_record["nonbib_data"]["bibgroup"] + if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get( + "bibgroup_facet", None + ): + out["bibgroup_facet"] = db_record["nonbib_data"]["bibgroup_facet"] + + boost_columns = ['doctype_boost', 'recency_boost', 'boost_factor', 'astronomy_final_boost', 'physics_final_boost', \ 'earth_science_final_boost', 'planetary_science_final_boost', 'heliophysics_final_boost', 'general_final_boost'] @@ -562,5 +554,5 @@ def transform_json_record(db_record): if any([char.isalnum() for char in out_field]): has.append(field) out["has"] = has - - return out + logger.debug('Out: {}'.format(out)) + return out \ No newline at end of file diff --git a/adsmp/tasks.py b/adsmp/tasks.py index 7620cc9..2445094 100644 --- a/adsmp/tasks.py +++ b/adsmp/tasks.py @@ -279,21 +279,21 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True # check if we have complete record for bibcode in bibcodes: - r = app.get_record(bibcode, load_only=fields) + record = app.get_record(bibcode, load_only=fields) - if r is None: + if record is None: logger.error('The bibcode %s doesn\'t exist!', bibcode) continue - augments_updated = r.get('augments_updated', None) - bib_data_updated = r.get('bib_data_updated', None) - fulltext_updated = r.get('fulltext_updated', None) - metrics_updated = r.get('metrics_updated', None) - nonbib_data_updated = r.get('nonbib_data_updated', None) - orcid_claims_updated = r.get('orcid_claims_updated', None) + augments_updated = record.get('augments_updated', None) + bib_data_updated = record.get('bib_data_updated', None) + fulltext_updated = record.get('fulltext_updated', None) + metrics_updated = record.get('metrics_updated', None) + nonbib_data_updated = record.get('nonbib_data_updated', None) + orcid_claims_updated = record.get('orcid_claims_updated', None) year_zero = '1972' - processed = r.get('processed', adsputils.get_date(year_zero)) + processed = record.get('processed', adsputils.get_date(year_zero)) if processed is None: processed = adsputils.get_date(year_zero) @@ -314,8 +314,7 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True metrics_updated, augments_updated)) # build the solr record if update_solr: - solr_payload = solr_updater.transform_json_record(r) - + solr_payload = solr_updater.transform_json_record(record) # ADS microservices assume the identifier field exists and contains the canonical bibcode: if 'identifier' not in solr_payload: solr_payload['identifier'] = [] @@ -323,7 +322,7 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True solr_payload['identifier'].append(solr_payload['bibcode']) logger.debug('Built SOLR record for %s', solr_payload['bibcode']) solr_checksum = app.checksum(solr_payload) - if ignore_checksums or r.get('solr_checksum', None) != solr_checksum: + if ignore_checksums or record.get('solr_checksum', None) != solr_checksum: solr_records.append(solr_payload) solr_records_checksum.append(solr_checksum) else: @@ -331,9 +330,9 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True # get data for metrics if update_metrics: - metrics_payload = r.get('metrics', None) + metrics_payload = record.get('metrics', None) metrics_checksum = app.checksum(metrics_payload or '') - if (metrics_payload and ignore_checksums) or (metrics_payload and r.get('metrics_checksum', None) != metrics_checksum): + if (metrics_payload and ignore_checksums) or (metrics_payload and record.get('metrics_checksum', None) != metrics_checksum): metrics_payload['bibcode'] = bibcode logger.debug('Got metrics: %s', metrics_payload) metrics_records.append(metrics_payload) @@ -342,10 +341,10 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True logger.debug('Checksum identical or no metrics data available, skipping metrics update for: %s', bibcode) if update_links and links_url: - datalinks_payload = app.generate_links_for_resolver(r) + datalinks_payload = app.generate_links_for_resolver(record) if datalinks_payload: datalinks_checksum = app.checksum(datalinks_payload) - if ignore_checksums or r.get('datalinks_checksum', None) != datalinks_checksum: + if ignore_checksums or record.get('datalinks_checksum', None) != datalinks_checksum: links_data_records.append(datalinks_payload) links_data_records_checksum.append(datalinks_checksum) else: diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py index 34e4ba8..33e2e84 100644 --- a/adsmp/tests/test_app.py +++ b/adsmp/tests/test_app.py @@ -39,15 +39,14 @@ def setUp(self): unittest.TestCase.setUp(self) proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) - with mock.patch.dict('os.environ', {'ADS_API_TOKEN': 'fixme'}): - self.app = app.ADSMasterPipelineCelery('test', local_config=\ - { - 'SQLALCHEMY_URL': 'sqlite:///', - 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test', - 'SQLALCHEMY_ECHO': False, - 'PROJ_HOME' : proj_home, - 'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'), - }) + self.app = app.ADSMasterPipelineCelery('test', local_config=\ + { + 'SQLALCHEMY_URL': 'sqlite:///', + 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test', + 'SQLALCHEMY_ECHO': False, + 'PROJ_HOME' : proj_home, + 'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'), + }) Base.metadata.bind = self.app._session.get_bind() Base.metadata.create_all() @@ -60,6 +59,88 @@ def tearDown(self): MetricsBase.metadata.drop_all() self.app.close_app() + @classmethod + def get_document_data(cls): + """ + Shared fixture-like method for test data. + """ + return { + "bibcode": "2013MNRAS.435.1904M", + "identifier": ["2013MNRAS.435.1904M", "2013arXiv1307.6556M", "2013MNRAS.tmp.2206M", "10.1093/mnras/stt1379", "arXiv:1307.6556"], + "links": { + "DOI": ["10.1093/mnras/stt1379"], + "ARXIV": ["arXiv:1307.6556"], + "DATA": { + "Chandra": { + "url": ["https://cda.harvard.edu/chaser?obsid=494,493,5290,5289,5286,5288,5287,3666,6162,6159,6163,6160,6161,13413,12028,10900,10898,13416,13414,12029,12027,13417,10899,13412,10901,13415,12026"], + "title": ["Chandra Data Archive ObsIds 494, 493, 5290, 5289, 5286, 5288, 5287, 3666, 6162, 6159, 6163, 6160, 6161, 13413, 12028, 10900, 10898, 13416, 13414, 12029, 12027, 13417, 10899, 13412, 10901, 13415, 12026"], + "count": 1 + }, + "ESA": { + "url": ["http://archives.esac.esa.int/ehst/#bibcode=2013MNRAS.435.1904M"], + "title": ["European HST References (EHST)"], + "count": 1 + }, + "HEASARC": { + "url": ["http://heasarc.gsfc.nasa.gov/cgi-bin/W3Browse/biblink.pl?code=2013MNRAS.435.1904M"], + "title": ["http://heasarc.gsfc.nasa.gov/cgi-bin/W3Browse/biblink.pl?code=2013MNRAS.435.1904M"], + "count": 1 + }, + "Herschel": { + "url": ["http://archives.esac.esa.int/hsa/whsa/?ACTION=PUBLICATION&ID=2013MNRAS.435.1904M"], + "title": ["http://archives.esac.esa.int/hsa/whsa/?ACTION=PUBLICATION&ID=2013MNRAS.435.1904M"], + "count": 1 + }, + "MAST": { + "url": ["https://archive.stsci.edu/mastbibref.php?bibcode=2013MNRAS.435.1904M"], + "title": ["MAST References (HST, EUVE, GALEX)"], + "count": 3 + }, + "NED": { + "url": ["https://$NED$/uri/NED::InRefcode/2013MNRAS.435.1904M"], + "title": ["NED Objects (1)"], + "count": 1 + }, + "SIMBAD": { + "url": ["http://$SIMBAD$/simbo.pl?bibcode=2013MNRAS.435.1904M"], + "title": ["SIMBAD Objects (30)"], + "count": 30 + }, + "XMM": { + "url": ["https://nxsa.esac.esa.int/nxsa-web/#bibcode=2013MNRAS.435.1904M"], + "title": ["XMM data (1 observations)"], + "count": 1 + } + }, + "ESOURCE": { + "EPRINT_HTML": { + "url": ["https://arxiv.org/abs/1307.6556"], + "title": [''] + }, + "EPRINT_PDF": { + "url": ["https://arxiv.org/pdf/1307.6556"], + "title": [''] + }, + "PUB_HTML": { + "url": ["https://doi.org/10.1093%2Fmnras%2Fstt1379"], + "title": [''] + }, + "PUB_PDF": { + "url": ["https://academic.oup.com/mnras/pdf-lookup/doi/10.1093/mnras/stt1379"], + "title": [''] + } + }, + "CITATIONS": True, + "REFERENCES": True, + "ABSTRACT": True, + "METRICS": False, + "TOC": True, + "COREAD": True, + "GRAPHICS": True, + "OPENURL": True, + } + } + def test_app(self): assert self.app._config.get('SQLALCHEMY_URL') == 'sqlite:///' assert self.app.conf.get('SQLALCHEMY_URL') == 'sqlite:///' @@ -303,19 +384,22 @@ def test_index_datalinks_success(self): """ m = mock.Mock() m.status_code = 200 + + document_data = self.get_document_data() # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) + self.app.update_storage('2013MNRAS.435.1904M', 'nonbib_data', document_data['links']) + with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} + + checksum = 'thechecksum' - self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), + self.app.index_datalinks([document_data], [checksum]) + p.assert_called_with('http://localhost:8080/update_new', + data=json.dumps([document_data]), headers={'Authorization': 'Bearer fixme'}) self.assertEqual(p.call_count, 1) # verify database updated - rec = self.app.get_record(bibcode='linkstest') + rec = self.app.get_record(bibcode='2013MNRAS.435.1904M') self.assertEqual(rec['datalinks_checksum'], 'thechecksum') self.assertEqual(rec['solr_checksum'], None) self.assertEqual(rec['metrics_checksum'], None) @@ -328,18 +412,21 @@ def test_index_datalinks_service_failure(self): """ m = mock.Mock() m.status_code = 500 + + document_data = self.get_document_data() + # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) + self.app.update_storage('2013MNRAS.435.1904M', 'nonbib_data', document_data['links']) + with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} + checksum = 'thechecksum' - self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), + self.app.index_datalinks([document_data], [checksum]) + p.assert_called_with('http://localhost:8080/update_new', + data=json.dumps([document_data]), headers={'Authorization': 'Bearer fixme'}) - rec = self.app.get_record(bibcode='linkstest') + rec = self.app.get_record(bibcode='2013MNRAS.435.1904M') self.assertEqual(p.call_count, 2) self.assertEqual(rec['datalinks_checksum'], None) self.assertEqual(rec['solr_checksum'], None) @@ -349,23 +436,26 @@ def test_index_datalinks_service_failure(self): def test_index_datalinks_service_only_batch_failure(self): # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) + + document_data = self.get_document_data() + self.app.update_storage('2013MNRAS.435.1904M', 'nonbib_data', document_data['links']) with mock.patch('requests.put') as p: bad = mock.Mock() bad.status_code = 500 + good = mock.Mock() good.status_code = 200 + p.side_effect = [bad, good] - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} + checksum = 'thechecksum' - self.app.index_datalinks([datalinks_payload], [checksum]) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), + self.app.index_datalinks([document_data], [checksum]) + p.assert_called_with('http://localhost:8080/update_new', + data=json.dumps([document_data]), headers={'Authorization': 'Bearer fixme'}) self.assertEqual(p.call_count, 2) # verify database updated - rec = self.app.get_record(bibcode='linkstest') + rec = self.app.get_record(bibcode='2013MNRAS.435.1904M') self.assertEqual(rec['datalinks_checksum'], 'thechecksum') self.assertEqual(rec['solr_checksum'], None) self.assertEqual(rec['metrics_checksum'], None) @@ -376,17 +466,17 @@ def test_index_datalinks_update_processed_false(self): m = mock.Mock() m.status_code = 200 # init database so timestamps and checksum can be updated - nonbib_data = {'data_links_rows': [{'baz': 0}]} - self.app.update_storage('linkstest', 'nonbib_data', nonbib_data) + document_data = self.get_document_data() + + self.app.update_storage('2013MNRAS.435.1904M', 'nonbib_data', document_data['links']) with mock.patch('requests.put', return_value=m) as p: - datalinks_payload = {u'bibcode': u'linkstest', u'data_links_rows': [{u'baz': 0}]} checksum = 'thechecksum' - self.app.index_datalinks([datalinks_payload], [checksum], update_processed=False) - p.assert_called_with('http://localhost:8080/update', - data=json.dumps([{'bibcode': 'linkstest', 'data_links_rows': [{'baz': 0}]}]), + self.app.index_datalinks([document_data], [checksum], update_processed=False) + p.assert_called_with('http://localhost:8080/update_new', + data=json.dumps([document_data]), headers={'Authorization': 'Bearer fixme'}) # verify database updated - rec = self.app.get_record(bibcode='linkstest') + rec = self.app.get_record(bibcode='2013MNRAS.435.1904M') self.assertEqual(rec['datalinks_checksum'], None) self.assertEqual(rec['solr_checksum'], None) self.assertEqual(rec['metrics_checksum'], None) @@ -411,33 +501,188 @@ def test_rename_bibcode(self): self.assertTrue(self.app.get_changelog('abc'), [{'target': u'def', 'key': u'abc'}]) def test_generate_links_for_resolver(self): - only_nonbib = {'bibcode': 'asdf', + only_nonbib = {'bibcode': '2025arXiv250220510L', 'nonbib_data': - {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}} + {'data_links_rows': [ + { + "link_type": "DATA", + "link_sub_type": "SIMBAD", + "url": [ + "https://simbad.u-strasbg.fr/simbad/sim-id?Ident=2025XYZ1234" + ], + "title": [ + "SIMBAD Astronomical Database" + ], + "item_count": 1 + }, + { + "link_type": "DATA", + "link_sub_type": "VIZIER", + "url": [ + "https://vizier.u-strasbg.fr/viz-bin/VizieR", + "https://vizier.u-strasbg.fr/viz-bin/VizieR-2" + ], + "title": [ + "VizieR Catalog Entry 1", + "VizieR Catalog Entry 2" + ], + "item_count": 2 + }, + { + "link_type": "ESOURCE", + "link_sub_type": "PUBLISHER", + "url": [ + "https://journalpublisher.com/paper/2025XYZ1234" + ], + "title": [ + "Published Paper" + ], + "item_count": 1 + }, + { + "link_type": "PRESENTATION", + "url": [ + "https://conference.org/2025/presentation123", + "https://conference.org/2025/slides123" + ], + "title": [ + "Conference Presentation", + "Presentation Slides" + ], + "item_count": 2 + }, + { + "link_type": "LIBRARYCATALOG", + "url": [ + "https://library.university.edu/catalog/2025XYZ1234" + ], + "title": [ + "University Library Catalog Entry" + ], + "item_count": 1 + } + ] + } + } + links = self.app.generate_links_for_resolver(only_nonbib) self.assertEqual(only_nonbib['bibcode'], links['bibcode']) - self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows']) + + expected_links = {'bibcode': '2025arXiv250220510L', + 'links': {'ARXIV': [], 'DOI': [], + 'DATA': {'SIMBAD': {'url': ['https://simbad.u-strasbg.fr/simbad/sim-id?Ident=2025XYZ1234'], 'title': ['SIMBAD Astronomical Database'], 'count': 1}, + 'VIZIER': {'url': ['https://vizier.u-strasbg.fr/viz-bin/VizieR', 'https://vizier.u-strasbg.fr/viz-bin/VizieR-2'], 'title': ['VizieR Catalog Entry 1', 'VizieR Catalog Entry 2'], 'count': 2}}, + 'ESOURCE': {'PUBLISHER': {'url': ['https://journalpublisher.com/paper/2025XYZ1234'], 'title': ['Published Paper'], 'count': 1}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, + 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': ['https://library.university.edu/catalog/2025XYZ1234'], 'title': ['University Library Catalog Entry'], 'count': 1}, + 'PRESENTATION': {'url': ['https://conference.org/2025/presentation123', 'https://conference.org/2025/slides123'], 'title': ['Conference Presentation', 'Presentation Slides'], 'count': 2}, + 'ABSTRACT': True, + 'CITATIONS': False, + 'GRAPHICS': True, + 'METRICS': False, + 'OPENURL': True, + 'REFERENCES': False, + 'TOC': False, + 'COREAD': True}, + 'identifier': ['2025arXiv250220510L']} + self.assertEqual(links, expected_links) only_bib = {'bibcode': 'asdf', 'bib_data': {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}} links = self.app.generate_links_for_resolver(only_bib) self.assertEqual(only_bib['bibcode'], links['bibcode']) - first = links['data_links_rows'][0] - self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0]) - self.assertEqual('ESOURCE', first['link_type']) - self.assertEqual('EPRINT_HTML', first['link_sub_type']) - self.assertEqual([''], first['title']) - self.assertEqual(0, first['item_count']) + expected_links = {'bibcode': 'asdf', 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': {'url': ['http://arxiv.org/abs/1902.09522'], + 'title': [''], 'count': 0}, + 'EPRINT_PDF': {'url': ['http://arxiv.org/pdf/1902.09522'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': True, + 'CITATIONS': False, + 'GRAPHICS': True, + 'METRICS': False, + 'OPENURL': True, + 'REFERENCES': False, + 'TOC': False, + 'COREAD': True}, + 'identifier': ['asdf']} + + self.assertEqual(links, expected_links) + + # Nonbib in old format and bib. Nonbib should be preferred bib_and_nonbib = {'bibcode': 'asdf', 'bib_data': - {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522zz"}']}, + {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://shouldnotbeused"}']}, 'nonbib_data': - {'data_links_rows': [{'url': ['http://arxiv.org/abs/1902.09522']}]}} + {'data_links_rows': [{'url': ['http://returnthis'], 'link_type': 'ESOURCE', 'link_sub_type': 'EPRINT_HTML'}]}} links = self.app.generate_links_for_resolver(bib_and_nonbib) - self.assertEqual(only_nonbib['bibcode'], links['bibcode']) - self.assertEqual(only_nonbib['nonbib_data']['data_links_rows'], links['data_links_rows']) + + self.assertEqual(bib_and_nonbib['bibcode'], links['bibcode']) + expected_links = {'bibcode': 'asdf', + 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': {'url': ['http://returnthis'], 'title': [], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': True, + 'CITATIONS': False, + 'GRAPHICS': True, + 'METRICS': False, + 'OPENURL': True, + 'REFERENCES': False, + 'TOC': False, + 'COREAD': True}, + 'identifier': ['asdf']} + self.assertEqual(links, expected_links) + + # Nonbib in new format and bib. Nonbib should be preferred + bib_and_nonbib = {'bibcode': 'asdf', + 'bib_data': + {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://shouldnotbeused"}']}, + 'nonbib_data': + {'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': {'url': ['http://returnthis'], 'title': [], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}}} + links = self.app.generate_links_for_resolver(bib_and_nonbib) + + expected_links = {'bibcode': 'asdf', 'links': {'ARXIV': [], + 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': + {'url': ['http://returnthis'], 'title': [], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, + 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 'ABSTRACT': False, 'CITATIONS': False, 'GRAPHICS': True, 'METRICS': False, 'OPENURL': True, 'REFERENCES': False, 'TOC': False, 'COREAD': True}, 'identifier': ['asdf']} + + + # Nonbib in new format and Nonbib in old format. This is a bug, but if it does happen, new format wins + bib_and_nonbib = {'bibcode': 'asdf', + 'bib_data': + {'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://shouldnotbeused"}']}, + 'nonbib_data': + {'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': {'url': ['http://returnthis'], 'title': [], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}}, + 'data_links_rows': [{'url': ['http://returnthis'], 'link_type': 'ESOURCE', 'link_sub_type': 'EPRINT_HTML'}]} + links = self.app.generate_links_for_resolver(bib_and_nonbib) + + expected_links = {'bibcode': 'asdf', 'links': {'ARXIV': [], + 'DOI': [], 'DATA': {}, + 'ESOURCE': {'EPRINT_HTML': + {'url': ['http://returnthis'], 'title': [], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, + 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 'ABSTRACT': True, 'CITATIONS': False, 'GRAPHICS': True, 'METRICS': False, 'OPENURL': True, 'REFERENCES': False, 'TOC': False, 'COREAD': True}, 'identifier': ['asdf']} + + self.assertEqual(links, expected_links) # string in database only_bib = {'bibcode': 'asdf', @@ -445,11 +690,12 @@ def test_generate_links_for_resolver(self): {'links_data': [u'{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}']}} links = self.app.generate_links_for_resolver(only_bib) self.assertEqual(only_bib['bibcode'], links['bibcode']) - first = links['data_links_rows'][0] - self.assertEqual('http://arxiv.org/abs/1902.09522', first['url'][0]) - self.assertEqual('ESOURCE', first['link_type']) - self.assertEqual('EPRINT_HTML', first['link_sub_type']) + eprint_html = links['links']['ESOURCE']['EPRINT_HTML'] + self.assertEqual('http://arxiv.org/abs/1902.09522', eprint_html['url'][0]) + self.assertEqual([''], eprint_html['title']) + self.assertEqual(0, eprint_html['count']) + # bad string in database with mock.patch.object(self.app.logger, 'error') as m: only_bib = {'bibcode': 'testbib', diff --git a/adsmp/tests/test_identifiers.py b/adsmp/tests/test_identifiers.py new file mode 100644 index 0000000..0487373 --- /dev/null +++ b/adsmp/tests/test_identifiers.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest +import os +import copy +import json +from mock import patch + +from adsmp import app +from adsmp.models import Base +import testing.postgresql + + +class TestIdentifierExtraction(unittest.TestCase): + """ + Tests the identifier extraction and processing methods in the application + """ + + @classmethod + def setUpClass(cls): + cls.postgresql = \ + testing.postgresql.Postgresql(host='127.0.0.1', port=15678, user='postgres', + database='test') + + @classmethod + def tearDownClass(cls): + cls.postgresql.stop() + + def setUp(self): + unittest.TestCase.setUp(self) + + proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) + self.app = app.ADSMasterPipelineCelery('test', local_config=\ + { + 'SQLALCHEMY_URL': 'sqlite:///', + 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test', + 'SQLALCHEMY_ECHO': False, + 'PROJ_HOME': proj_home, + 'TEST_DIR': os.path.join(proj_home, 'adsmp/tests'), + }) + Base.metadata.bind = self.app._session.get_bind() + Base.metadata.create_all() + + def tearDown(self): + unittest.TestCase.tearDown(self) + Base.metadata.drop_all() + self.app.close_app() + + def test_extract_data_components(self): + """Test that _extract_data_components correctly extracts and validates data""" + + # Test with complete record + complete_record = { + 'bibcode': '2008arXiv0802.0143H', + 'bib_data': {'title': 'Test Title'}, + 'fulltext': {'body': 'Test body'}, + 'metrics': {'citation_num': 5}, + 'nonbib_data': {'property': ['ARTICLE']}, + 'orcid_claims': {'authors': []} + } + + bibcode, bib_data, fulltext, metrics, nonbib, orcid_claims = \ + self.app._extract_data_components(complete_record) + + self.assertEqual(bibcode, '2008arXiv0802.0143H') + self.assertEqual(bib_data, {'title': 'Test Title'}) + self.assertEqual(fulltext, {'body': 'Test body'}) + self.assertEqual(metrics, {'citation_num': 5}) + self.assertEqual(nonbib, {'property': ['ARTICLE']}) + self.assertEqual(orcid_claims, {'authors': []}) + + # Test with missing components + incomplete_record = { + 'bibcode': '2008arXiv0802.0143H', + 'bib_data': None, # This should be converted to {} + # missing fulltext + 'metrics': 'invalid', # Not a dict, should be converted to {} + # missing nonbib + 'orcid_claims': [] # Not a dict, should be converted to {} + } + + bibcode, bib_data, fulltext, metrics, nonbib, orcid_claims = \ + self.app._extract_data_components(incomplete_record) + + self.assertEqual(bibcode, '2008arXiv0802.0143H') + self.assertEqual(bib_data, {}) + self.assertEqual(fulltext, {}) + self.assertEqual(metrics, {}) + self.assertEqual(nonbib, {}) + self.assertEqual(orcid_claims, {}) + + def test_is_arxiv_id(self): + """Test that _is_arxiv_id correctly identifies arXiv identifiers""" + + # Test standard arXiv format with prefix + self.assertTrue(self.app.is_arxiv_id('arxiv:2301.12345')) + self.assertTrue(self.app.is_arxiv_id('arXiv:2301.12345')) + + # Test URL format + self.assertTrue(self.app.is_arxiv_id('10.48550/arXiv.2502.20510')) + self.assertTrue(self.app.is_arxiv_id('10.48550/arXiv.astro-ph/0610305')) + + # Test edge cases + self.assertTrue(self.app.is_arxiv_id('arXiv:2301.12345')) # Original capitalization preserved + self.assertTrue(self.app.is_arxiv_id('ARXIV:2301.12345')) # Case preserved + + # Test with non-arXiv identifiers + self.assertFalse(self.app.is_arxiv_id('doi:10.1234/test')) + self.assertFalse(self.app.is_arxiv_id('2008arXiv0802.0143H')) # Bibcode + + # Test with None or non-string input + self.assertFalse(self.app.is_arxiv_id(123)) + + def test_is_doi_id(self): + """Test that _is_doi_id correctly identifies DOI identifiers""" + + # Test standard DOI format with prefix + # self.assertTrue(self.app.is_doi_id('10.1234/test')) + + self.assertTrue(self.app.is_doi_id("10.48550/arXiv.2502.20510")) + self.assertTrue(self.app.is_doi_id("10.1016/j.jtbi.2008.11.029")) + + # Test direct DOI format (no prefix) + self.assertTrue(self.app.is_doi_id('10.1234/test')) + + # Test with non-DOI identifiers + self.assertFalse(self.app.is_doi_id('arxiv:2301.12345')) + self.assertFalse(self.app.is_doi_id('2008arXiv0802.0143H')) # Bibcode + + # Test with None or non-string input + self.assertFalse(self.app.is_doi_id(123)) + + def test_collect_identifiers(self): + """Test that _collect_identifiers correctly collects identifiers from all sources""" + + # Test with all sources having identifiers + bibcode = '2008arXiv0802.0143H' + bib_data = { + 'identifier': ["1988AnBot..61..393A", + "10.1093/oxfordjournals.aob.a087569", + "10.1016/j.physleta.2005.08.078", + "10.48550/arXiv.nlin/0510022", + "arXiv:nlin/0510022", + "2005nlin.....10022X", + "2006PhLA..349..128X", + "10.1134/S0036024419050133", + "2019RJPCA..93..993G", + "2005hep.ph...10301C"], + 'alternate_bibcode': [ "1942JGR....47..251E", + "2004cond.mat.11661L", + "1942QB51.B5........", + "2005cond.mat..9445G", + "2005cond.mat..3372D", + "1977VeMFA..18...25K", + "1942QB107.M3.......", + "2004math.ph..12044E", + "2005nlin.....10022X", + None] + } + + + identifiers = self.app._collect_identifiers(bibcode, bib_data) + + expected = set([bibcode] + bib_data['identifier'] + [id for id in bib_data['alternate_bibcode'] if id is not None]) + + self.assertEqual(sorted(identifiers), sorted(expected)) + + # Test with missing data + bibcode = '2008arXiv0802.0143H' + bib_data = {} # No identifiers + + identifiers = self.app._collect_identifiers(bibcode, bib_data) + + # Should only contain the bibcode + self.assertEqual(set(identifiers), {'2008arXiv0802.0143H'}) + + def test_populate_identifiers(self): + """Test that _populate_identifiers correctly extracts and populates identifier information""" + + # Create a test record with a variety of identifiers + test_record = { + 'bibcode': '2008arXiv0802.0143H', + 'bib_data': { + 'identifier': [ + "10.48550/arXiv.2502.20510", + "arXiv:2502.20510", + "2025arXiv250220510L", + "10.48550/arXiv.2502.20407", + "2025arXiv250220407K", + "arXiv:2502.20407", + "10.48550/arXiv.2004.00015", + "arXiv:2004.00015", + "2020arXiv200400015S", + "2020arXiv200712475E", + "arXiv:2007.12475", + "10.48550/arXiv.2007.12475", + "arXiv:2502.20561" + ], + 'alternate_bibcode': ["1942JGR....47..251E", + "2004cond.mat.11661L", + "1942QB51.B5........", + "2005cond.mat..9445G", + "2005cond.mat..3372D", + "1977VeMFA..18...25K", + "1942QB107.M3.......", + "2004math.ph..12044E", + "2005nlin.....10022X"] + } + } + + # Create a resolver record and links structure + resolver_record = {'bibcode': '2008arXiv0802.0143H'} + links = { + 'ARXIV': [], + 'DOI': [] + } + + # Call the method + updated_links = self.app._populate_identifiers(test_record['bibcode'], test_record['bib_data'], resolver_record, links) + + # Check that identifiers were extracted and consolidated + self.assertIn('identifier', resolver_record) + self.assertEqual(len(resolver_record['identifier']), len(set(test_record['bib_data']['identifier'] + test_record['bib_data']['alternate_bibcode'] + [test_record['bibcode']]))) # All unique identifiers + + # Check that ARXIV and DOI fields were populated correctly - now with full identifiers + self.assertEqual(set(updated_links['ARXIV']), set(['10.48550/arXiv.2502.20510', 'arXiv:2502.20510', '10.48550/arXiv.2004.00015', 'arXiv:2502.20561', 'arXiv:2004.00015', '10.48550/arXiv.2007.12475', 'arXiv:2007.12475', 'arXiv:2502.20407', '10.48550/arXiv.2502.20407'])) + self.assertEqual(set(updated_links['DOI']), set(['10.48550/arXiv.2502.20407', '10.48550/arXiv.2502.20510', '10.48550/arXiv.2007.12475', '10.48550/arXiv.2004.00015'])) + + + def test_integration_with_populate_links_structure(self): + """Test the integration between _populate_identifiers and _populate_links_structure""" + + # Create a comprehensive test record + test_record = { + 'bibcode': '2008arXiv0802.0143H', + 'bib_data': { + 'abstract': 'This is a test abstract', + 'identifier': [ + 'arXiv:math/0406160', + 'arXiv:2502.20510', + '10.48550/arXiv.2004.00015', + '10.1007/s10955-009-9793-2' + ] + }, + 'fulltext': { + 'body': 'Test body' + }, + 'metrics': { + 'citation_num': 10, + 'reads': [20, 10, 5], + 'reference_num': 15 + }, + 'nonbib_data': { + 'reference': ['ref1', 'ref2', 'ref3'], + 'property': ['ARTICLE', 'REFEREED', 'TOC'] + } + } + + # Create a basic resolver record + resolver_record = { + 'bibcode': '2008arXiv0802.0143H', + 'links': { + 'ARXIV': [], + 'DOI': [], + 'DATA': {}, + 'ESOURCE': {}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, + 'CITATIONS': False, + 'GRAPHICS': False, + 'METRICS': False, + 'OPENURL': False, + 'REFERENCES': False, + 'TOC': False, + 'COREAD': False + } + } + + # Populate the links structure + enriched_record = self.app._populate_links_structure(test_record, resolver_record) + + # Verify identifiers were collected + self.assertIn('identifier', enriched_record) + self.assertIn('2008arXiv0802.0143H', enriched_record['identifier']) + self.assertIn('arXiv:math/0406160', enriched_record['identifier']) + self.assertIn('10.48550/arXiv.2004.00015', enriched_record['identifier']) + + # Verify ARXIV and DOI fields were populated with original identifiers + self.assertIn('arXiv:2502.20510', enriched_record['links']['ARXIV']) + self.assertIn('10.48550/arXiv.2004.00015', enriched_record['links']['DOI']) + self.assertIn('10.1007/s10955-009-9793-2', enriched_record['links']['DOI']) + + # Verify other fields were set correctly + self.assertTrue(enriched_record['links']['ABSTRACT']) + self.assertTrue(enriched_record['links']['CITATIONS']) + self.assertTrue(enriched_record['links']['GRAPHICS']) + self.assertTrue(enriched_record['links']['METRICS']) + self.assertTrue(enriched_record['links']['REFERENCES']) + self.assertTrue(enriched_record['links']['TOC']) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/adsmp/tests/test_links_resolver.py b/adsmp/tests/test_links_resolver.py new file mode 100644 index 0000000..b2650f5 --- /dev/null +++ b/adsmp/tests/test_links_resolver.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest +import os +import copy +import json + +from adsmp import app +from adsmp.models import Base +from adsputils import get_date +import testing.postgresql + +class TestLinksResolver(unittest.TestCase): + """ + Tests for the links resolver data structure completeness + """ + + @classmethod + def setUpClass(cls): + cls.postgresql = \ + testing.postgresql.Postgresql(host='127.0.0.1', port=15678, user='postgres', + database='test') + + @classmethod + def tearDownClass(cls): + cls.postgresql.stop() + + def setUp(self): + unittest.TestCase.setUp(self) + + proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) + self.app = app.ADSMasterPipelineCelery('test', local_config=\ + { + 'SQLALCHEMY_URL': 'sqlite:///', + 'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test', + 'SQLALCHEMY_ECHO': False, + 'PROJ_HOME' : proj_home, + 'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'), + }) + Base.metadata.bind = self.app._session.get_bind() + Base.metadata.create_all() + + def tearDown(self): + unittest.TestCase.tearDown(self) + Base.metadata.drop_all() + self.app.close_app() + + def create_complete_test_record(self): + """Create a test record with all fields needed for a complete resolver structure""" + return { + 'bibcode': '2023MNRAS.518..529S', + 'bib_data': { + 'identifier': [ + 'arXiv:2301.12345', + '10.1093/mnras/stac3079', + '10.48550/arXiv.astro-ph/0610305', + '2023MNRAS.518..529S', + ], + 'alternate_bibcode': ['2023arXiv230112345S'], + 'abstract': 'This is a test abstract for a complete record', + 'pub': 'Monthly Notices of the Royal Astronomical Society', + 'volume': '518', + 'issue': '1', + 'page': '529-545', + }, + 'fulltext': { + 'body': 'This is the full text body.', + 'acknowledgements': 'The authors would like to thank everyone.' + }, + 'metrics': { + 'citation_num': 25, + 'reads': [50, 30, 20, 10], + 'downloads': [20, 15, 10, 5], + 'reference_num': 45, + 'citations': ['2023ApJ...934L...7B', '2023A&A...672A..34C'], + 'rn_citations_hist': { + '0': 10, # Number of reads in the most recent period + '1': 8, + '2': 6 + } + }, + 'nonbib_data': { + 'citation': ['2023ApJ...934L...7B', '2023A&A...672A..34C'], + 'citation_count': 25, + 'reference': ['2021ApJ...908L..16F', '2022ApJ...930L..10K', '2022MNRAS.512.1317B'], + 'read_count': 110, + 'property': ['REFEREED', 'ARTICLE', 'TOC'], + 'data_links_rows': [ + { + 'link_type': 'ESOURCE', + 'link_sub_type': 'PUB_HTML', + 'url': ['https://academic.oup.com/mnras/article/518/1/529/6795543'], + 'title': ['Publisher HTML'], + 'item_count': 1 + }, + { + 'link_type': 'DATA', + 'link_sub_type': 'CDS', + 'url': ['https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/518/529'], + 'title': ['CDS Astronomical Database'], + 'item_count': 1 + }, + { + 'link_type': 'INSPIRE', + 'url': ['https://inspirehep.net/literature/2023MNRAS.518..529S'], + 'title': ['INSPIRE Entry'], + 'item_count': 1 + }, + { + 'link_type': 'ASSOCIATED', + 'url': ['https://zenodo.org/record/12345'], + 'title': ['Zenodo Dataset'], + 'item_count': 1 + }, + { + 'link_type': 'PRESENTATION', + 'url': ['https://conference.org/2023/presentation123'], + 'title': ['Conference Presentation'], + 'item_count': 1 + }, + { + 'link_type': 'LIBRARYCATALOG', + 'url': ['https://library.institution.edu/catalog/2023MNRAS.518..529S'], + 'title': ['Library Catalog Entry'], + 'item_count': 1 + } + ] + } + } + + def test_links_resolver_completeness(self): + """Test that the links resolver structure is complete with all expected fields""" + + # Create a complex test record with all required fields + test_record = self.create_complete_test_record() + + # Generate resolver record + resolver_record = self.app.generate_links_for_resolver(test_record) + + # Check that the resolver record is created and has the correct bibcode + self.assertIsNotNone(resolver_record) + self.assertEqual(test_record['bibcode'], resolver_record['bibcode']) + + # Check that links structure exists and is a dict + self.assertIn('links', resolver_record) + self.assertIsInstance(resolver_record['links'], dict) + + # Verify all identifiers were collected + self.assertIn('identifier', resolver_record) + + # IDENTIFIERS + identifiers = resolver_record['identifier'] + self.assertIsInstance(identifiers, list) + # Check that all identifiers from the test record are included + for identifier in test_record['bib_data']['identifier']: + self.assertIn(identifier, identifiers) + # Check that alternate bibcode is included + self.assertIn(test_record['bib_data']['alternate_bibcode'][0], identifiers) + + # LINKS STRUCTURE COMPLETENESS + links = resolver_record['links'] + + # Check core fields are present + self.assertIn('ARXIV', links) + self.assertIn('DOI', links) + self.assertIn('DATA', links) + self.assertIn('ESOURCE', links) + self.assertIn('ASSOCIATED', links) + self.assertIn('INSPIRE', links) + self.assertIn('LIBRARYCATALOG', links) + self.assertIn('PRESENTATION', links) + + # Check boolean flags + self.assertIn('ABSTRACT', links) + self.assertIn('CITATIONS', links) + self.assertIn('GRAPHICS', links) + self.assertIn('METRICS', links) + self.assertIn('OPENURL', links) + self.assertIn('REFERENCES', links) + self.assertIn('TOC', links) + self.assertIn('COREAD', links) + + # Verify ARXIV and DOI fields contain the correct identifiers + self.assertIsInstance(links['ARXIV'], list) + self.assertIsInstance(links['DOI'], list) + self.assertIn('arXiv:2301.12345', links['ARXIV']) + self.assertIn('10.1093/mnras/stac3079', links['DOI']) + self.assertIn('10.48550/arXiv.astro-ph/0610305', links['DOI']) + # Verify DATA link types + self.assertIsInstance(links['DATA'], dict) + self.assertIn('CDS', links['DATA']) + self.assertEqual(links['DATA']['CDS']['url'][0], 'https://cdsarc.cds.unistra.fr/viz-bin/cat/J/MNRAS/518/529') + self.assertEqual(links['DATA']['CDS']['title'][0], 'CDS Astronomical Database') + self.assertEqual(links['DATA']['CDS']['count'], 1) + + # Verify ESOURCE link types + self.assertIsInstance(links['ESOURCE'], dict) + self.assertIn('PUB_HTML', links['ESOURCE']) + self.assertEqual(links['ESOURCE']['PUB_HTML']['url'][0], 'https://academic.oup.com/mnras/article/518/1/529/6795543') + self.assertEqual(links['ESOURCE']['PUB_HTML']['title'][0], 'Publisher HTML') + self.assertEqual(links['ESOURCE']['PUB_HTML']['count'], 1) + + # Verify ASSOCIATED link + self.assertIsInstance(links['ASSOCIATED'], dict) + self.assertIn('url', links['ASSOCIATED']) + self.assertIn('title', links['ASSOCIATED']) + self.assertIn('count', links['ASSOCIATED']) + self.assertEqual(links['ASSOCIATED']['url'][0], 'https://zenodo.org/record/12345') + self.assertEqual(links['ASSOCIATED']['title'][0], 'Zenodo Dataset') + self.assertEqual(links['ASSOCIATED']['count'], 1) + + # Verify INSPIRE link + self.assertIsInstance(links['INSPIRE'], dict) + self.assertIn('url', links['INSPIRE']) + self.assertIn('title', links['INSPIRE']) + self.assertIn('count', links['INSPIRE']) + self.assertEqual(links['INSPIRE']['url'][0], 'https://inspirehep.net/literature/2023MNRAS.518..529S') + self.assertEqual(links['INSPIRE']['title'][0], 'INSPIRE Entry') + self.assertEqual(links['INSPIRE']['count'], 1) + + # Verify PRESENTATION link + self.assertIsInstance(links['PRESENTATION'], dict) + self.assertIn('url', links['PRESENTATION']) + self.assertIn('title', links['PRESENTATION']) + self.assertIn('count', links['PRESENTATION']) + self.assertEqual(links['PRESENTATION']['url'][0], 'https://conference.org/2023/presentation123') + self.assertEqual(links['PRESENTATION']['title'][0], 'Conference Presentation') + self.assertEqual(links['PRESENTATION']['count'], 1) + + # Verify LIBRARYCATALOG link + self.assertIsInstance(links['LIBRARYCATALOG'], dict) + self.assertIn('url', links['LIBRARYCATALOG']) + self.assertIn('title', links['LIBRARYCATALOG']) + self.assertIn('count', links['LIBRARYCATALOG']) + self.assertEqual(links['LIBRARYCATALOG']['url'][0], 'https://library.institution.edu/catalog/2023MNRAS.518..529S') + self.assertEqual(links['LIBRARYCATALOG']['title'][0], 'Library Catalog Entry') + self.assertEqual(links['LIBRARYCATALOG']['count'], 1) + + # Verify boolean flags are all set correctly + self.assertTrue(links['ABSTRACT']) + self.assertTrue(links['CITATIONS']) + self.assertTrue(links['GRAPHICS']) + self.assertTrue(links['METRICS']) + self.assertTrue(links['OPENURL']) + self.assertTrue(links['REFERENCES']) + self.assertTrue(links['TOC']) + self.assertTrue(links['COREAD']) + + # Test record without links_data - should return None + minimal_record = { + 'bibcode': 'minimal2023', + 'bib_data': { + 'identifier': ['minimal2023'] + } + } + + minimal_resolver = self.app.generate_links_for_resolver(minimal_record) + self.assertIsNone(minimal_resolver) + + # Test minimal record with links_data containing a URL + minimal_record_with_links = { + 'bibcode': 'minimal2023', + 'bib_data': { + 'identifier': ['minimal2023'], + 'links_data': ['{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1902.09522"}'] + } + } + + minimal_resolver = self.app.generate_links_for_resolver(minimal_record_with_links) + + # Check that resolver record is not None + self.assertIsNotNone(minimal_resolver) + self.assertEqual('minimal2023', minimal_resolver['bibcode']) + + # Check links structure + minimal_links = minimal_resolver['links'] + + # Verify structure is complete even with minimal data + self.assertIn('ARXIV', minimal_links) + self.assertIn('DOI', minimal_links) + self.assertIn('DATA', minimal_links) + self.assertIn('ESOURCE', minimal_links) + self.assertIn('ASSOCIATED', minimal_links) + self.assertIn('INSPIRE', minimal_links) + self.assertIn('LIBRARYCATALOG', minimal_links) + self.assertIn('PRESENTATION', minimal_links) + self.assertIn('ABSTRACT', minimal_links) + self.assertIn('CITATIONS', minimal_links) + self.assertIn('GRAPHICS', minimal_links) + self.assertIn('METRICS', minimal_links) + self.assertIn('OPENURL', minimal_links) + self.assertIn('REFERENCES', minimal_links) + self.assertIn('TOC', minimal_links) + self.assertIn('COREAD', minimal_links) + + # Check ESOURCE fields for the URL from links_data + self.assertIn('EPRINT_HTML', minimal_links['ESOURCE']) + self.assertEqual(minimal_links['ESOURCE']['EPRINT_HTML']['url'][0], 'http://arxiv.org/abs/1902.09522') + + # Check that a PDF link is also created + self.assertIn('EPRINT_PDF', minimal_links['ESOURCE']) + self.assertEqual(minimal_links['ESOURCE']['EPRINT_PDF']['url'][0], 'http://arxiv.org/pdf/1902.09522') + + # Test with non-bib data links + nonbib_links_record = { + 'bibcode': 'nonbib2023', + 'nonbib_data': { + 'data_links_rows': [ + { + 'link_type': 'ESOURCE', + 'link_sub_type': 'EPRINT_HTML', + 'url': ['http://arxiv.org/abs/2301.00001'], + 'title': ['arXiv Preprint'], + 'item_count': 1 + } + ] + } + } + + nonbib_resolver = self.app.generate_links_for_resolver(nonbib_links_record) + self.assertIsNotNone(nonbib_resolver) + nonbib_links = nonbib_resolver['links'] + + # Check that ESOURCE contains the expected data + self.assertIn('EPRINT_HTML', nonbib_links['ESOURCE']) + self.assertEqual(nonbib_links['ESOURCE']['EPRINT_HTML']['url'][0], 'http://arxiv.org/abs/2301.00001') + self.assertEqual(nonbib_links['ESOURCE']['EPRINT_HTML']['title'][0], 'arXiv Preprint') + + def test_links_structure_with_edge_cases(self): + """Test that the links resolver structure handles edge cases correctly""" + + # Test with empty fields + empty_record = { + 'bibcode': 'empty2023', + 'bib_data': {}, + 'fulltext': {}, + 'metrics': {}, + 'nonbib_data': {} + } + + empty_resolver = self.app.generate_links_for_resolver(empty_record) + self.assertIsNone(empty_resolver) + + # Test with None values + none_record = { + 'bibcode': 'none2023', + 'bib_data': None, + 'fulltext': None, + 'metrics': None, + 'nonbib_data': None + } + + none_resolver = self.app.generate_links_for_resolver(none_record) + self.assertIsNone(none_resolver) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py index bf1072f..7b8864f 100644 --- a/adsmp/tests/test_tasks.py +++ b/adsmp/tests/test_tasks.py @@ -63,7 +63,7 @@ def setUp(self): "SQLALCHEMY_ECHO": False, "SOLR_URLS": ["http://foo.bar.com/solr/v1"], "METRICS_SQLALCHEMY_URL": None, - "LINKS_RESOLVER_UPDATE_URL": "http://localhost:8080/update", + "LINKS_RESOLVER_UPDATE_URL": "http://localhost:8080/update_new", "ADS_API_TOKEN": "api_token", }, ) @@ -487,15 +487,21 @@ def test_task_index_records_links(self): force=True, ) p.assert_called_with( - "http://localhost:8080/update", + "http://localhost:8080/update_new", data=json.dumps( - [{"bibcode": "linkstest", "data_links_rows": [{"baz": 0}]}] + [{"bibcode": "linkstest", + "links": {"ARXIV": [], "DOI": [], "DATA": {}, "ESOURCE": {}, + "ASSOCIATED": {"url": [], "title": [], "count": 0}, + "INSPIRE": {"url": [], "title": [], "count": 0}, + "LIBRARYCATALOG": {"url": [], "title": [], "count": 0}, + "PRESENTATION": {"url": [], "title": [], "count": 0}, + "ABSTRACT": True, "CITATIONS": False, "GRAPHICS": True, "METRICS": False, "OPENURL": True, "REFERENCES": False, "TOC": False, "COREAD": True}, + "identifier": ["linkstest"]}] ), - headers={"Authorization": "Bearer api_token"}, - ) + headers={"Authorization": "Bearer api_token"}) rec = self.app.get_record(bibcode="linkstest") - self.assertEqual(rec["datalinks_checksum"], "0x80e85169") + self.assertEqual(rec["datalinks_checksum"], "0xfed29826") self.assertEqual(rec["solr_checksum"], None) self.assertEqual(rec["metrics_checksum"], None) @@ -611,7 +617,7 @@ def test_ignore_checksums_datalinks(self): "bib_data_updated": get_date(), "nonbib_data_updated": get_date(), "processed": get_date(str(future_year)), - "datalinks_checksum": "0x80e85169", + "datalinks_checksum": "0xfed29826", }, ), patch( "adsmp.tasks.task_index_data_links_resolver.apply_async", diff --git a/config.py b/config.py index bd4ba64..c6a33bd 100644 --- a/config.py +++ b/config.py @@ -44,7 +44,7 @@ # url and token for the update endpoint of the links resolver microservice # new links data is sent to this url, the mircoservice updates its datastore -LINKS_RESOLVER_UPDATE_URL = "http://localhost:8080/update" +LINKS_RESOLVER_UPDATE_URL = "http://localhost:8080/update_new" ADS_API_TOKEN = "fixme" # Sitemap configuration