adsabs · mugdhapolimera · Nov 19, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 21, 2024
diff --git a/adsmp/app.py b/adsmp/app.py
@@ -2,7 +2,7 @@
 from __future__ import absolute_import, unicode_literals
 from past.builtins import basestring
 from . import exceptions
-from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records
+from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, BoostFactors
 from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord
 from adsmsg.msg import Msg
 from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager
@@ -71,6 +71,16 @@ def __init__(self, app_name, *args, **kwargs):
                 'rn_citations': getattr(self._metrics_table_upsert.excluded, 'rn_citations'),
                 'rn_citation_data': getattr(self._metrics_table_upsert.excluded, 'rn_citation_data')}
             self._metrics_table_upsert = self._metrics_table_upsert.on_conflict_do_update(index_elements=['bibcode'], set_=update_columns)
+
+        if self._config.get("DOCTYPE_RANKING", False):
+            doctype_rank = self._config.get("DOCTYPE_RANKING") 
+            unique_ranks = sorted(set(doctype_rank.values()))
+
+            # Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score)
+            rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)}
+
+            # Assign scores to each rank
+            self._doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()}
 
     def update_storage(self, bibcode, type, payload):
         """Update the document in the database, every time
@@ -120,6 +130,9 @@ def update_storage(self, bibcode, type, payload):
 
             r.updated = now
             out = r.toJSON()
+            if out.get("bib_data", None):
+                doctype_boost = self.generate_doctype_boost(out["bib_data"].get("bibcode", None),out["bib_data"].get("doctype", None) )
+
             try:
                 session.commit()
                 return out
@@ -136,6 +149,12 @@ def delete_by_bibcode(self, bibcode):
                 session.delete(r)
                 session.commit()
                 return True
+
+            b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+            if b is not None:
+                session.delete(b)
+                session.commit()
+                return True
 
     def rename_bibcode(self, old_bibcode, new_bibcode):
         assert old_bibcode and new_bibcode
@@ -555,3 +574,47 @@ def generate_links_for_resolver(self, record):
                     # here if record holds unexpected value
                     self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record))
         return resolver_record
+
+    def generate_doctype_boost(self, bibcode, doctype = None):
+        if not doctype:
+            with self.session_scope() as session:
+                r = session.query(Records).filter_by(bibcode=bibcode).first()
+                r = r.toJSON()
+                if r:
+                    if r.get("bibdata", None):
+                        doctype = r["bibdata"].get("doctype", None)
+
+        if bibcode and doctype:
+            with self.session_scope() as session:
+                b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+
+                if b is None:
+                    b = BoostFactors(bibcode=bibcode)
+                    session.add(b)
+
+                b.doctype_boost = self._doctype_scores.get(doctype, None)
+
+                try:
+                    session.commit()
+                except exc.IntegrityError:
+                    self.logger.exception('error in app.generate_doctype_boost while updating database for bibcode {}'.format(r.bibcode))
+                    session.rollback()
+                    raise
+
+                return b.doctype_boost
+
+    def get_doctype_boost(self, bibcode):
+        if bibcode:
+            with self.session_scope() as session:
+                b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+
+                if b:
+                    return b.doctype_boost
+
+    def delete_table_contents(self, table):
+        """Delete all contents of the table
+        :param table: string, name of the table
+        """
+        with self.session_scope() as session:
+            session.query(table).delete()
+            session.commit()
diff --git a/adsmp/models.py b/adsmp/models.py
@@ -5,12 +5,13 @@
 from adsputils import get_date
 from datetime import datetime
 from dateutil.tz import tzutc
-from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime
+from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime, Float
 from sqlalchemy import types
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.types import Enum
 from sqlalchemy.dialects import postgresql
 from sqlalchemy import text
+from sqlalchemy import ForeignKey
 import json
 
 Base = declarative_base()
@@ -184,3 +185,58 @@ def toJSON(self):
                     rn_citations=self.rn_citations,
                     rn_citation_data=self.rn_citation_data,
                     modtime=self.modtime and get_date(self.modtime).isoformat() or None)
+
+class BoostFactors(Base):
+    """
+    Tracks the various boost factors associated to each bibcode
+    Attributes:
+        id (int): The unique identifier for the table
+        record_id (int): The foreign key referencing the associated record.
+        bibcode (str) : bibcode of the record
+        doctype_boost (float): Boost factor between 0 to 1 based on doctype of the record
+        recency_boost (float): Boost factor between 0 to 1 based on pubyear of the record
+        refereed_boost (float): Boost factor between 0 to 1 based on refereed status of the record
+        cite_boost_astro (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the astro collection and year
+        cite_boost_geo (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the earth science collection and year
+        cite_boost_planetary (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the planetary science collection and year
+        cite_boost_physics (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the physics collection and year
+        cite_boost_general (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the general collection and year
+        cite_boost (float): Boost factor between 0 to 1; combination of other cite_boost scores based on the all the collections relevant to the record
+
+    """
+
+    __tablename__ = 'boostfactors'
+
+    id = Column(Integer, primary_key=True) #bigint
+    record_id = Column(Integer, ForeignKey('records.id'))
+    bibcode = Column(String(255))
+    doctype_boost = Column(Float)
+    cite_boost = Column(Float)
+    recency_boost = Column(Float)
+    refereed_boost = Column(Float)
+    cite_boost_astro = Column(Float)
+    cite_boost_geo = Column(Float)
+    cite_boost_planetary = Column(Float)
+    cite_boost_physics = Column(Float)
+    cite_boost_general = Column(Float)
+
+    def toJSON(self):
+        """
+        Converts the BoostFactors object to a JSON representation.
+        Returns:
+            dict: A dictionary containing the JSON representation of the object.
+        """
+        return {
+            'id': self.id,
+            'record_id': self.record_id,
+            'bibcode': self.bibcode,
+             "doctype_boost": self.doctype_boost,
+            "cite_boost": self.cite_boost,
+            "recency_boost": self.recency_boost,
+            "refereed_boost": self.refereed_boost,
+            "cite_boost_astro": self.cite_boost_astro,
+            "cite_boost_geo": self.cite_boost_geo,
+            "cite_boost_planetary": self.cite_boost_planetary,
+            "cite_boost_physics": self.cite_boost_physics,
+            "cite_boost_general": self.cite_boost_general,
+        }
diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py
@@ -21,7 +21,6 @@ def extract_metrics_pipeline(data, solrdoc):
 
     return dict(citation=citation)
 
-
 def extract_data_pipeline(data, solrdoc):
     reader = data.get("readers", [])
     read_count = len(reader)
@@ -301,7 +300,6 @@ def get_timestamps(db_record, out):
         out["update_timestamp"] = date2solrstamp(last_update)
     return out
 
-
 DB_COLUMN_DESTINATIONS = [
     ("bib_data", ""),
     ("orcid_claims", get_orcid_claims),
@@ -466,6 +464,11 @@ def transform_json_record(db_record):
                         db_record["bibcode"], type(links_data), links_data
                     )
                 )
+    out["doctype_boost"] = None
+    if db_record.get("nonbib_data"):
+        if db_record["nonbib_data"].get("doctype_boost", None):
+            out["doctype_boost"] = db_record["nonbib_data"]["doctype_boost"]
+
     if config.get("ENABLE_HAS", False):
         # Read-in names of fields to check for solr "has:" field
         hasfields = sorted(config.get("HAS_FIELDS", []))

diff --git a/adsmp/tasks.py b/adsmp/tasks.py
@@ -7,7 +7,7 @@
 from adsmp import solr_updater
 from kombu import Queue
 from adsmsg.msg import Msg
-
+from adsmp.models import BoostFactors
 # ============================= INITIALIZATION ==================================== #
 
 proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
@@ -191,6 +191,11 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
             logger.error('The bibcode %s doesn\'t exist!', bibcode)
             continue
 
+        # Adding boost factors to r to pass it to solr
+        doctype_boost = app.get_doctype_boost(bibcode)
+        if doctype_boost:
+            r["nonbib_data"]["doctype_boost"] = doctype_boost
+        logger.debug("doctype_boost ",doctype_boost)
         augments_updated = r.get('augments_updated', None)
         bib_data_updated = r.get('bib_data_updated', None)
         fulltext_updated = r.get('fulltext_updated', None)
@@ -304,6 +309,28 @@ def task_delete_documents(bibcode):
     else:
         logger.debug('Failed to deleted metrics record: %s', bibcode)
 
+@app.task(queue='populate-boostfactors-table') 
+def task_populate_boostfactors_table(bibcodes, boost_action):
+    """
+    Populate the sitemap table for the given bibcodes
+    """
+
+    if boost_action not in ['add', 'delete-tablecontents']:
+        logger.error("Invalid boost_action %s, must be 'add', 'remove', 'force-update', 'delete-table'", boost_action)
+        return
+
+    if boost_action == 'delete-tablecontents':
+        # reset and empty all entries in boostfactors table
+        app.delete_table_contents(BoostFactors)
+        return
+
+    if isinstance(bibcodes, basestring):
+        bibcodes = [bibcodes]
+
+    if boost_action == 'add':
+        logger.debug('Updating doctype_boost info in boostfactors table for: %s', bibcodes)
+        for bibcode in bibcodes:
+            app.generate_doctype_boost(bibcode)
 
 if __name__ == '__main__':
     app.start()
diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py
@@ -64,7 +64,7 @@ def test_mark_processed(self):
         r = self.app.get_record('abc')
         self.assertEqual(r, None)
 
-        self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
+        self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'doctype': 'article'})
         self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
         r = self.app.get_record('abc')
 
@@ -76,6 +76,7 @@ def test_mark_processed(self):
         self.assertTrue(r['solr_processed'])
         self.assertTrue(r['processed'])
         self.assertEqual(r['status'], 'solr-failed')
+        self.assertEqual(self.app.get_doctype_boost(r["bibcode"]), 1)
 
     def test_index_solr(self):
         self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
@@ -453,6 +454,5 @@ def test_generate_links_for_resolver(self):
             self.assertTrue('testbib' in str(m_args[0]))
             self.assertTrue('foobar' in str(m_args[0]))
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/adsmp/tests/test_solr_updater.py b/adsmp/tests/test_solr_updater.py
@@ -307,6 +307,11 @@ def test_solr_transformer(self):
         )
         rec = self.app.get_record("bibcode")
 
+        # Generate doctype_boost given the doctype from bib_data
+        doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype'])
+        # Add doctype_boost to nonbib_data as done in reindex task
+        rec["nonbib_data"]["doctype_boost"] = doctype_boost        
+
         x = solr_updater.transform_json_record(rec)
         # self.assertFalse('aff' in x, 'virtual field should not be in solr output')
 
@@ -340,6 +345,7 @@ def test_solr_transformer(self):
                 "volume",
             ],
         )
+        self.assertEqual(round(x["doctype_boost"],3),0.857)
 
         self.app.update_storage(
             "bibcode",
@@ -513,6 +519,11 @@ def test_solr_transformer(self):
                 self.assertEqual(x[f], "2017-09-19T21:17:12.026474Z")
 
         rec = self.app.get_record("bibcode")
+        # Generate doctype_boost given the doctype from bib_data
+        doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype'])
+        # Add doctype_boost to nonbib_data as done in reindex task
+        rec["nonbib_data"]["doctype_boost"] = doctype_boost        
+
         x = solr_updater.transform_json_record(rec)
 
         self.assertTrue("aff" in x)  # aff is no longer a virtual field
@@ -546,6 +557,7 @@ def test_solr_transformer(self):
                 "volume",
             ],
         )
+        self.assertEqual(round(x["doctype_boost"],3),0.857)
 
     def test_links_data_merge(self):
         # links_data only from bib

diff --git a/adsmp/tests/test_tasks.py b/adsmp/tests/test_tasks.py
@@ -537,15 +537,15 @@ def test_avoid_duplicates(self):
             tasks.task_index_records(["foo"], force=True)
 
             self.assertEqual(update_solr.call_count, 1)
-            self._check_checksum("foo", solr="0x4db9a611")
+            self._check_checksum("foo", solr="0x8f51bd8d")
 
             # now change metrics (solr shouldn't be called)
             getter.return_value = {
                 "bibcode": "foo",
                 "metrics_updated": get_date("1972-04-02"),
                 "bib_data_updated": get_date("1972-04-01"),
                 "metrics": {},
-                "solr_checksum": "0x4db9a611",
+                "solr_checksum": "0x8f51bd8d",
             }
             tasks.task_index_records(["foo"], force=True)
             self.assertEqual(update_solr.call_count, 1)
@@ -563,7 +563,7 @@ def test_ignore_checksums_solr(self):
                 "bibcode": "foo",
                 "metrics_updated": get_date("1972-04-02"),
                 "bib_data_updated": get_date("1972-04-01"),
-                "solr_checksum": "0x4db9a611",
+                "solr_checksum": "0x8f51bd8d",
             }
 
             # update with matching checksum and then update and ignore checksums

diff --git a/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py b/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py
@@ -0,0 +1,45 @@
+"""Added BoostFactors table
+
+Revision ID: 6973dd1d9f2b
+Revises: 2d2af8a9c996
+Create Date: 2024-11-20 18:23:26.770835
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '6973dd1d9f2b'
+down_revision = '2d2af8a9c996'
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    #with app.app_context() as c:
+    #   db.session.add(Model())
+    #   db.session.commit()
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('boostfactors',
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('record_id', sa.Integer(), nullable=True),
+    sa.Column('bibcode', sa.String(length=255), nullable=True),
+    sa.Column('doctype_boost', sa.Float(), nullable=True),
+    sa.Column('cite_boost', sa.Float(), nullable=True),
+    sa.Column('recency_boost', sa.Float(), nullable=True),
+    sa.Column('refereed_boost', sa.Float(), nullable=True),
+    sa.Column('cite_boost_astro', sa.Float(), nullable=True),
+    sa.Column('cite_boost_geo', sa.Float(), nullable=True),
+    sa.Column('cite_boost_planetary', sa.Float(), nullable=True),
+    sa.Column('cite_boost_physics', sa.Float(), nullable=True),
+    sa.Column('cite_boost_general', sa.Float(), nullable=True),
+    sa.ForeignKeyConstraint(['record_id'], ['records.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('boostfactors')
+    # ### end Alembic commands ###