diff --git a/adsmp/app.py b/adsmp/app.py index e80168b..ff4f9fb 100644 --- a/adsmp/app.py +++ b/adsmp/app.py @@ -2,7 +2,7 @@ from __future__ import absolute_import, unicode_literals from past.builtins import basestring from . import exceptions -from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records +from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, BoostFactors from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord from adsmsg.msg import Msg from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager @@ -71,6 +71,16 @@ def __init__(self, app_name, *args, **kwargs): 'rn_citations': getattr(self._metrics_table_upsert.excluded, 'rn_citations'), 'rn_citation_data': getattr(self._metrics_table_upsert.excluded, 'rn_citation_data')} self._metrics_table_upsert = self._metrics_table_upsert.on_conflict_do_update(index_elements=['bibcode'], set_=update_columns) + + if self._config.get("DOCTYPE_RANKING", False): + doctype_rank = self._config.get("DOCTYPE_RANKING") + unique_ranks = sorted(set(doctype_rank.values())) + + # Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score) + rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)} + + # Assign scores to each rank + self._doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()} def update_storage(self, bibcode, type, payload): """Update the document in the database, every time @@ -120,6 +130,9 @@ def update_storage(self, bibcode, type, payload): r.updated = now out = r.toJSON() + if out.get("bib_data", None): + doctype_boost = self.generate_doctype_boost(out["bib_data"].get("bibcode", None),out["bib_data"].get("doctype", None) ) + try: session.commit() return out @@ -136,6 +149,12 @@ def delete_by_bibcode(self, bibcode): session.delete(r) session.commit() return True + + b = session.query(BoostFactors).filter_by(bibcode=bibcode).first() + if b is not None: + session.delete(b) + session.commit() + return True def rename_bibcode(self, old_bibcode, new_bibcode): assert old_bibcode and new_bibcode @@ -555,3 +574,47 @@ def generate_links_for_resolver(self, record): # here if record holds unexpected value self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record)) return resolver_record + + def generate_doctype_boost(self, bibcode, doctype = None): + if not doctype: + with self.session_scope() as session: + r = session.query(Records).filter_by(bibcode=bibcode).first() + r = r.toJSON() + if r: + if r.get("bibdata", None): + doctype = r["bibdata"].get("doctype", None) + + if bibcode and doctype: + with self.session_scope() as session: + b = session.query(BoostFactors).filter_by(bibcode=bibcode).first() + + if b is None: + b = BoostFactors(bibcode=bibcode) + session.add(b) + + b.doctype_boost = self._doctype_scores.get(doctype, None) + + try: + session.commit() + except exc.IntegrityError: + self.logger.exception('error in app.generate_doctype_boost while updating database for bibcode {}'.format(r.bibcode)) + session.rollback() + raise + + return b.doctype_boost + + def get_doctype_boost(self, bibcode): + if bibcode: + with self.session_scope() as session: + b = session.query(BoostFactors).filter_by(bibcode=bibcode).first() + + if b: + return b.doctype_boost + + def delete_table_contents(self, table): + """Delete all contents of the table + :param table: string, name of the table + """ + with self.session_scope() as session: + session.query(table).delete() + session.commit() \ No newline at end of file diff --git a/adsmp/models.py b/adsmp/models.py index b442175..f03af25 100644 --- a/adsmp/models.py +++ b/adsmp/models.py @@ -5,12 +5,13 @@ from adsputils import get_date from datetime import datetime from dateutil.tz import tzutc -from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime +from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime, Float from sqlalchemy import types from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.types import Enum from sqlalchemy.dialects import postgresql from sqlalchemy import text +from sqlalchemy import ForeignKey import json Base = declarative_base() @@ -184,3 +185,58 @@ def toJSON(self): rn_citations=self.rn_citations, rn_citation_data=self.rn_citation_data, modtime=self.modtime and get_date(self.modtime).isoformat() or None) + +class BoostFactors(Base): + """ + Tracks the various boost factors associated to each bibcode + Attributes: + id (int): The unique identifier for the table + record_id (int): The foreign key referencing the associated record. + bibcode (str) : bibcode of the record + doctype_boost (float): Boost factor between 0 to 1 based on doctype of the record + recency_boost (float): Boost factor between 0 to 1 based on pubyear of the record + refereed_boost (float): Boost factor between 0 to 1 based on refereed status of the record + cite_boost_astro (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the astro collection and year + cite_boost_geo (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the earth science collection and year + cite_boost_planetary (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the planetary science collection and year + cite_boost_physics (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the physics collection and year + cite_boost_general (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the general collection and year + cite_boost (float): Boost factor between 0 to 1; combination of other cite_boost scores based on the all the collections relevant to the record + + """ + + __tablename__ = 'boostfactors' + + id = Column(Integer, primary_key=True) #bigint + record_id = Column(Integer, ForeignKey('records.id')) + bibcode = Column(String(255)) + doctype_boost = Column(Float) + cite_boost = Column(Float) + recency_boost = Column(Float) + refereed_boost = Column(Float) + cite_boost_astro = Column(Float) + cite_boost_geo = Column(Float) + cite_boost_planetary = Column(Float) + cite_boost_physics = Column(Float) + cite_boost_general = Column(Float) + + def toJSON(self): + """ + Converts the BoostFactors object to a JSON representation. + Returns: + dict: A dictionary containing the JSON representation of the object. + """ + return { + 'id': self.id, + 'record_id': self.record_id, + 'bibcode': self.bibcode, + "doctype_boost": self.doctype_boost, + "cite_boost": self.cite_boost, + "recency_boost": self.recency_boost, + "refereed_boost": self.refereed_boost, + "cite_boost_astro": self.cite_boost_astro, + "cite_boost_geo": self.cite_boost_geo, + "cite_boost_planetary": self.cite_boost_planetary, + "cite_boost_physics": self.cite_boost_physics, + "cite_boost_general": self.cite_boost_general, + } \ No newline at end of file diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py index 9499cf8..a47b034 100644 --- a/adsmp/solr_updater.py +++ b/adsmp/solr_updater.py @@ -21,7 +21,6 @@ def extract_metrics_pipeline(data, solrdoc): return dict(citation=citation) - def extract_data_pipeline(data, solrdoc): reader = data.get("readers", []) read_count = len(reader) @@ -301,7 +300,6 @@ def get_timestamps(db_record, out): out["update_timestamp"] = date2solrstamp(last_update) return out - DB_COLUMN_DESTINATIONS = [ ("bib_data", ""), ("orcid_claims", get_orcid_claims), @@ -466,6 +464,11 @@ def transform_json_record(db_record): db_record["bibcode"], type(links_data), links_data ) ) + out["doctype_boost"] = None + if db_record.get("nonbib_data"): + if db_record["nonbib_data"].get("doctype_boost", None): + out["doctype_boost"] = db_record["nonbib_data"]["doctype_boost"] + if config.get("ENABLE_HAS", False): # Read-in names of fields to check for solr "has:" field hasfields = sorted(config.get("HAS_FIELDS", [])) diff --git a/adsmp/tasks.py b/adsmp/tasks.py index 29216fc..bf11cca 100644 --- a/adsmp/tasks.py +++ b/adsmp/tasks.py @@ -7,7 +7,7 @@ from adsmp import solr_updater from kombu import Queue from adsmsg.msg import Msg - +from adsmp.models import BoostFactors # ============================= INITIALIZATION ==================================== # proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../')) @@ -191,6 +191,11 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True logger.error('The bibcode %s doesn\'t exist!', bibcode) continue + # Adding boost factors to r to pass it to solr + doctype_boost = app.get_doctype_boost(bibcode) + if doctype_boost: + r["nonbib_data"]["doctype_boost"] = doctype_boost + logger.debug("doctype_boost ",doctype_boost) augments_updated = r.get('augments_updated', None) bib_data_updated = r.get('bib_data_updated', None) fulltext_updated = r.get('fulltext_updated', None) @@ -304,6 +309,28 @@ def task_delete_documents(bibcode): else: logger.debug('Failed to deleted metrics record: %s', bibcode) +@app.task(queue='populate-boostfactors-table') +def task_populate_boostfactors_table(bibcodes, boost_action): + """ + Populate the sitemap table for the given bibcodes + """ + + if boost_action not in ['add', 'delete-tablecontents']: + logger.error("Invalid boost_action %s, must be 'add', 'remove', 'force-update', 'delete-table'", boost_action) + return + + if boost_action == 'delete-tablecontents': + # reset and empty all entries in boostfactors table + app.delete_table_contents(BoostFactors) + return + + if isinstance(bibcodes, basestring): + bibcodes = [bibcodes] + + if boost_action == 'add': + logger.debug('Updating doctype_boost info in boostfactors table for: %s', bibcodes) + for bibcode in bibcodes: + app.generate_doctype_boost(bibcode) if __name__ == '__main__': app.start() diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py index 2c5e4de..d7cfe53 100644 --- a/adsmp/tests/test_app.py +++ b/adsmp/tests/test_app.py @@ -64,7 +64,7 @@ def test_mark_processed(self): r = self.app.get_record('abc') self.assertEqual(r, None) - self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1}) + self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'doctype': 'article'}) self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success') r = self.app.get_record('abc') @@ -76,6 +76,7 @@ def test_mark_processed(self): self.assertTrue(r['solr_processed']) self.assertTrue(r['processed']) self.assertEqual(r['status'], 'solr-failed') + self.assertEqual(self.app.get_doctype_boost(r["bibcode"]), 1) def test_index_solr(self): self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1}) @@ -453,6 +454,5 @@ def test_generate_links_for_resolver(self): self.assertTrue('testbib' in str(m_args[0])) self.assertTrue('foobar' in str(m_args[0])) - if __name__ == '__main__': unittest.main() diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py index 415689d..f201307 100644 --- a/adsmp/tests/test_solr_updater.py +++ b/adsmp/tests/test_solr_updater.py @@ -307,6 +307,11 @@ def test_solr_transformer(self): ) rec = self.app.get_record("bibcode") + # Generate doctype_boost given the doctype from bib_data + doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype']) + # Add doctype_boost to nonbib_data as done in reindex task + rec["nonbib_data"]["doctype_boost"] = doctype_boost + x = solr_updater.transform_json_record(rec) # self.assertFalse('aff' in x, 'virtual field should not be in solr output') @@ -340,6 +345,7 @@ def test_solr_transformer(self): "volume", ], ) + self.assertEqual(round(x["doctype_boost"],3),0.857) self.app.update_storage( "bibcode", @@ -513,6 +519,11 @@ def test_solr_transformer(self): self.assertEqual(x[f], "2017-09-19T21:17:12.026474Z") rec = self.app.get_record("bibcode") + # Generate doctype_boost given the doctype from bib_data + doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype']) + # Add doctype_boost to nonbib_data as done in reindex task + rec["nonbib_data"]["doctype_boost"] = doctype_boost + x = solr_updater.transform_json_record(rec) self.assertTrue("aff" in x) # aff is no longer a virtual field @@ -546,6 +557,7 @@ def test_solr_transformer(self): "volume", ], ) + self.assertEqual(round(x["doctype_boost"],3),0.857) def test_links_data_merge(self): # links_data only from bib diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py index 6e74215..a18da1d 100644 --- a/adsmp/tests/test_tasks.py +++ b/adsmp/tests/test_tasks.py @@ -537,7 +537,7 @@ def test_avoid_duplicates(self): tasks.task_index_records(["foo"], force=True) self.assertEqual(update_solr.call_count, 1) - self._check_checksum("foo", solr="0x4db9a611") + self._check_checksum("foo", solr="0x8f51bd8d") # now change metrics (solr shouldn't be called) getter.return_value = { @@ -545,7 +545,7 @@ def test_avoid_duplicates(self): "metrics_updated": get_date("1972-04-02"), "bib_data_updated": get_date("1972-04-01"), "metrics": {}, - "solr_checksum": "0x4db9a611", + "solr_checksum": "0x8f51bd8d", } tasks.task_index_records(["foo"], force=True) self.assertEqual(update_solr.call_count, 1) @@ -563,7 +563,7 @@ def test_ignore_checksums_solr(self): "bibcode": "foo", "metrics_updated": get_date("1972-04-02"), "bib_data_updated": get_date("1972-04-01"), - "solr_checksum": "0x4db9a611", + "solr_checksum": "0x8f51bd8d", } # update with matching checksum and then update and ignore checksums diff --git a/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py b/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py new file mode 100644 index 0000000..b1313a6 --- /dev/null +++ b/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py @@ -0,0 +1,45 @@ +"""Added BoostFactors table + +Revision ID: 6973dd1d9f2b +Revises: 2d2af8a9c996 +Create Date: 2024-11-20 18:23:26.770835 + +""" + +# revision identifiers, used by Alembic. +revision = '6973dd1d9f2b' +down_revision = '2d2af8a9c996' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + #with app.app_context() as c: + # db.session.add(Model()) + # db.session.commit() + + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('boostfactors', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('record_id', sa.Integer(), nullable=True), + sa.Column('bibcode', sa.String(length=255), nullable=True), + sa.Column('doctype_boost', sa.Float(), nullable=True), + sa.Column('cite_boost', sa.Float(), nullable=True), + sa.Column('recency_boost', sa.Float(), nullable=True), + sa.Column('refereed_boost', sa.Float(), nullable=True), + sa.Column('cite_boost_astro', sa.Float(), nullable=True), + sa.Column('cite_boost_geo', sa.Float(), nullable=True), + sa.Column('cite_boost_planetary', sa.Float(), nullable=True), + sa.Column('cite_boost_physics', sa.Float(), nullable=True), + sa.Column('cite_boost_general', sa.Float(), nullable=True), + sa.ForeignKeyConstraint(['record_id'], ['records.id'], ), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('boostfactors') + # ### end Alembic commands ### diff --git a/config.py b/config.py index 23d07b0..c4dac72 100644 --- a/config.py +++ b/config.py @@ -7,7 +7,7 @@ # possible values: WARN, INFO, DEBUG -LOGGING_LEVEL = "INFO" +LOGGING_LEVEL = "DEBUG" CELERY_INCLUDE = ["adsmp.tasks"] OUTPUT_CELERY_BROKER = "pyamqp://test:test@localhost:5682/test_augment_pipeline" @@ -66,3 +66,27 @@ "uat", "volume", ] + +DOCTYPE_RANKING = { + "article": 1, + "eprint": 1, + "inproceedings": 2, + "inbook": 1, + "abstract": 4, + "book": 1, + "bookreview": 4, + "catalog": 2, + "circular": 3, + "erratum": 6, + "mastersthesis": 3, + "newsletter": 5, + "obituary": 6, + "phdthesis": 3, + "pressrelease": 7, + "proceedings": 3, + "proposal": 4, + "software": 2, + "talk": 4, + "techreport": 3, + "misc": 8 +} \ No newline at end of file diff --git a/run.py b/run.py index 042ad14..002e32d 100755 --- a/run.py +++ b/run.py @@ -399,6 +399,13 @@ def reindex_failed_bibcodes(app, update_processed=True): bibs = [] logger.info('Done reindexing %s previously failed bibcodes', count) +def populate_boostfactors_table(bibcodes, boost_action): + """ + actions: 'add': add/update doctype_boost scores in boostfactors table for given bibcodes + 'delete-table-contents': delete all contents of boostfactors table + """ + tasks.task_populate_boostfactors_table(bibcodes, boost_action = boost_action) + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process user input.') @@ -524,6 +531,15 @@ def reindex_failed_bibcodes(app, update_processed=True): default=False, dest='update_processed', help='update processed timestamps and other state info in records table when a record is indexed') + parser.add_argument('--populate-boostfactors-table', + action='store_true', + default=False, + dest='populate_boostfactors_table', + help='populate boostfactors table for list of bibcode') + parser.add_argument('--boost_action', + default=False, + choices=['add', 'delete-tablecontents'], + help='action for populate_boostfactors_table function') args = parser.parse_args() @@ -534,7 +550,7 @@ def reindex_failed_bibcodes(app, update_processed=True): print_kvs() logger.info('Executing run.py: %s', args) - + # uff: this whole block needs refactoring (as is written, it only allows for single operation) if args.diagnostics: diagnostics(args.bibcodes) @@ -594,6 +610,22 @@ def reindex_failed_bibcodes(app, update_processed=True): rebuild_collection(args.solr_collection, args.batch_size) elif args.index_failed: reindex_failed_bibcodes(app, args.update_processed) + elif args.populate_boostfactors_table: + if args.filename: + bibs = [] + with open(args.filename) as f: + for line in f: + bibcode = line.strip() + if bibcode: + bibs.append(bibcode) + elif args.bibcodes: + bibs = args.bibcodes + if args.boost_action: + boost_action = args.boost_action + else: + boost_action = 'add' + + populate_boostfactors_table(bibs, boost_action = boost_action) elif args.reindex: update_solr = 's' in args.reindex.lower() update_metrics = 'm' in args.reindex.lower()