Skip to content
349 changes: 329 additions & 20 deletions adsmp/app.py

Large diffs are not rendered by default.

76 changes: 32 additions & 44 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,62 +384,43 @@ def transform_json_record(db_record):
timestamps = []
for k, v in DB_COLUMN_DESTINATIONS:
ts = db_record.get(k + "_updated", None)
if ts:
ts = time.mktime(ts.timetuple())
else:
ts = sys.maxsize # default to use option without timestamp
ts = time.mktime(ts.timetuple()) if ts else sys.maxsize # default to use option without timestamp
timestamps.append((k, v, ts))
timestamps.sort(key=lambda x: x[2])

# merge data based on timestamps
for field, target, _ in timestamps:
for field, target, ts in timestamps: # fields = {bib_data, nonbib_data, orcid_claims, metrics ..}
if db_record.get(field, None):
if target:
if not target: # bib_data
out.update(db_record.get(field))
else:
if callable(target):
x = target(
enriched_data = target(
db_record.get(field), out
) # in the interest of speed, don't create copy of out
if x:
out.update(x)
else:
out[target] = db_record.get(field)
else:
if target is None:
continue

out.update(db_record.get(field))

elif field.startswith("#"):
if enriched_data:
out.update(enriched_data)
else: # id
out[target] = db_record.get(field)
elif field.startswith("#"): # timestamps
if callable(target):
x = target(
enriched_data = target(
db_record, out
) # in the interest of speed, don't create copy of out
if x:
out.update(x)

# override temporal priority for links data
if (
db_record.get("bib_data", None)
and db_record.get("nonbib_data", None)
and db_record["bib_data"].get("links_data", None)
and db_record["nonbib_data"].get("links_data", None)
if enriched_data:
out.update(enriched_data)

# If both bib and nonbib pipeline provided links data
# use nonbib data even if it is older
if all(
db_record.get(key, {}).get("links_data")
for key in ("bib_data", "nonbib_data")
):
# here if both bib and nonbib pipeline provided links data
# use nonbib data even if it is older

out["links_data"] = db_record["nonbib_data"]["links_data"]

# override temporal priority for bibgroup and bibgroup_facet, prefer nonbib
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup", None
):
out["bibgroup"] = db_record["nonbib_data"]["bibgroup"]
if db_record.get("nonbib_data", None) and db_record["nonbib_data"].get(
"bibgroup_facet", None
):
out["bibgroup_facet"] = db_record["nonbib_data"]["bibgroup_facet"]

# if only bib data is available, use it to compute property
if db_record.get("nonbib_data", None) is None and db_record.get("bib_data", None):
# Else if only bib pipeline provided links data
elif db_record.get("bib_data", {}).get("links_data"):
links_data = db_record["bib_data"].get("links_data", None)
if links_data:
try:
Expand All @@ -466,11 +447,18 @@ def transform_json_record(db_record):
db_record["bibcode"], type(links_data), links_data
)
)

out["scix_id"] = None
if db_record.get("scix_id", None):
out["scix_id"] = db_record.get("scix_id")

# Compute doctype scores on the fly
# override temporal priority for bibgroup and bibgroup_facet, prefer nonbib
for key in ("bibgroup", "bibgroup_facet"):
if db_record.get("nonbib_data", {}).get(key):
out[key] = db_record["nonbib_data"][key]


# Compute doctype scores on the fly
out["doctype_boost"] = None

if config.get("DOCTYPE_RANKING", False):
Expand Down Expand Up @@ -510,4 +498,4 @@ def transform_json_record(db_record):
has.append(field)
out["has"] = has

return out
return out
31 changes: 15 additions & 16 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

from __future__ import absolute_import, unicode_literals
from past.builtins import basestring
import os
Expand Down Expand Up @@ -230,21 +229,21 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True

# check if we have complete record
for bibcode in bibcodes:
r = app.get_record(bibcode, load_only=fields)
record = app.get_record(bibcode, load_only=fields)

if r is None:
if record is None:
logger.error('The bibcode %s doesn\'t exist!', bibcode)
continue

augments_updated = r.get('augments_updated', None)
bib_data_updated = r.get('bib_data_updated', None)
fulltext_updated = r.get('fulltext_updated', None)
metrics_updated = r.get('metrics_updated', None)
nonbib_data_updated = r.get('nonbib_data_updated', None)
orcid_claims_updated = r.get('orcid_claims_updated', None)
augments_updated = record.get('augments_updated', None)
bib_data_updated = record.get('bib_data_updated', None)
fulltext_updated = record.get('fulltext_updated', None)
metrics_updated = record.get('metrics_updated', None)
nonbib_data_updated = record.get('nonbib_data_updated', None)
orcid_claims_updated = record.get('orcid_claims_updated', None)

year_zero = '1972'
processed = r.get('processed', adsputils.get_date(year_zero))
processed = record.get('processed', adsputils.get_date(year_zero))
if processed is None:
processed = adsputils.get_date(year_zero)

Expand All @@ -265,25 +264,25 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
metrics_updated, augments_updated))
# build the solr record
if update_solr:
solr_payload = solr_updater.transform_json_record(r)
solr_payload = solr_updater.transform_json_record(record)
# ADS microservices assume the identifier field exists and contains the canonical bibcode:
if 'identifier' not in solr_payload:
solr_payload['identifier'] = []
if 'bibcode' in solr_payload and solr_payload['bibcode'] not in solr_payload['identifier']:
solr_payload['identifier'].append(solr_payload['bibcode'])
logger.debug('Built SOLR: %s', solr_payload)
solr_checksum = app.checksum(solr_payload)
if ignore_checksums or r.get('solr_checksum', None) != solr_checksum:
if ignore_checksums or record.get('solr_checksum', None) != solr_checksum:
solr_records.append(solr_payload)
solr_records_checksum.append(solr_checksum)
else:
logger.debug('Checksum identical, skipping solr update for: %s', bibcode)

# get data for metrics
if update_metrics:
metrics_payload = r.get('metrics', None)
metrics_payload = record.get('metrics', None)
metrics_checksum = app.checksum(metrics_payload or '')
if (metrics_payload and ignore_checksums) or (metrics_payload and r.get('metrics_checksum', None) != metrics_checksum):
if (metrics_payload and ignore_checksums) or (metrics_payload and record.get('metrics_checksum', None) != metrics_checksum):
metrics_payload['bibcode'] = bibcode
logger.debug('Got metrics: %s', metrics_payload)
metrics_records.append(metrics_payload)
Expand All @@ -292,10 +291,10 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
logger.debug('Checksum identical or no metrics data available, skipping metrics update for: %s', bibcode)

if update_links and links_url:
datalinks_payload = app.generate_links_for_resolver(r)
datalinks_payload = app.generate_links_for_resolver(record)
if datalinks_payload:
datalinks_checksum = app.checksum(datalinks_payload)
if ignore_checksums or r.get('datalinks_checksum', None) != datalinks_checksum:
if ignore_checksums or record.get('datalinks_checksum', None) != datalinks_checksum:
links_data_records.append(datalinks_payload)
links_data_records_checksum.append(datalinks_checksum)
else:
Expand Down
Loading