adsabs · mugdhapolimera · Nov 19, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 21, 2024
diff --git a/adsmp/app.py b/adsmp/app.py
@@ -2,7 +2,7 @@
 from __future__ import absolute_import, unicode_literals
 from past.builtins import basestring
 from . import exceptions
-from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records
+from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, BoostFactors
 from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord
 from adsmsg.msg import Msg
 from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager
@@ -71,6 +71,16 @@ def __init__(self, app_name, *args, **kwargs):
                 'rn_citations': getattr(self._metrics_table_upsert.excluded, 'rn_citations'),
                 'rn_citation_data': getattr(self._metrics_table_upsert.excluded, 'rn_citation_data')}
             self._metrics_table_upsert = self._metrics_table_upsert.on_conflict_do_update(index_elements=['bibcode'], set_=update_columns)
+
+        if self._config.get("DOCTYPE_RANKING", False):
+            doctype_rank = self._config.get("DOCTYPE_RANKING") 
+            unique_ranks = sorted(set(doctype_rank.values()))
+
+            # Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score)
+            rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)}
+
+            # Assign scores to each rank
+            self._doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()}
 
     def update_storage(self, bibcode, type, payload):
         """Update the document in the database, every time
@@ -120,6 +130,9 @@ def update_storage(self, bibcode, type, payload):
 
             r.updated = now
             out = r.toJSON()
+            if out.get("bib_data", None):
+                self.generate_doctype_boost(out["bib_data"].get("bibcode", None),out["bib_data"].get("doctype", None) )
+
             try:
                 session.commit()
                 return out
@@ -136,6 +149,12 @@ def delete_by_bibcode(self, bibcode):
                 session.delete(r)
                 session.commit()
                 return True
+
+            b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+            if b is not None:
+                session.delete(b)
+                session.commit()
+                return True
 
     def rename_bibcode(self, old_bibcode, new_bibcode):
         assert old_bibcode and new_bibcode
@@ -555,3 +574,29 @@ def generate_links_for_resolver(self, record):
                     # here if record holds unexpected value
                     self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record))
         return resolver_record
+
+    def generate_doctype_boost(self, bibcode, doctype):
+        if bibcode and doctype:
+            with self.session_scope() as session:
+                b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+
+                if b is None:
+                    b = BoostFactors(bibcode=bibcode)
+                    session.add(b)
+
+                b.doctype_boost = self._doctype_scores.get(doctype, None)
+
+                try:
+                    session.commit()
+                except exc.IntegrityError:
+                    self.logger.exception('error in app.generate_doctype_boost while updating database for bibcode {}'.format(r.bibcode))
+                    session.rollback()
+                    raise
+
+    def get_doctype_boost(self, bibcode):
+        if bibcode:
+            with self.session_scope() as session:
+                b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
+
+                if b:
+                    return b.doctype_boost
diff --git a/adsmp/models.py b/adsmp/models.py
@@ -5,12 +5,13 @@
 from adsputils import get_date
 from datetime import datetime
 from dateutil.tz import tzutc
-from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime
+from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime, Float
 from sqlalchemy import types
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.types import Enum
 from sqlalchemy.dialects import postgresql
 from sqlalchemy import text
+from sqlalchemy import ForeignKey
 import json
 
 Base = declarative_base()
@@ -184,3 +185,58 @@ def toJSON(self):
                     rn_citations=self.rn_citations,
                     rn_citation_data=self.rn_citation_data,
                     modtime=self.modtime and get_date(self.modtime).isoformat() or None)
+
+class BoostFactors(Base):
+    """
+    Tracks the various boost factors associated to each bibcode
+    Attributes:
+        id (int): The unique identifier for the table
+        record_id (int): The foreign key referencing the associated record.
+        bibcode (str) : bibcode of the record
+        doctype_boost (float): Boost factor between 0 to 1 based on doctype of the record
+        recency_boost (float): Boost factor between 0 to 1 based on pubyear of the record
+        refereed_boost (float): Boost factor between 0 to 1 based on refereed status of the record
+        cite_boost_astro (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the astro collection and year
+        cite_boost_geo (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the earth science collection and year
+        cite_boost_planetary (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the planetary science collection and year
+        cite_boost_physics (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the physics collection and year
+        cite_boost_general (float): Boost factor between 0 to 1 based on the number of citations of the record relative to the general collection and year
+        cite_boost (float): Boost factor between 0 to 1; combination of other cite_boost scores based on the all the collections relevant to the record
+
+    """
+
+    __tablename__ = 'boostfactors'
+
+    id = Column(Integer, primary_key=True) #bigint
+    record_id = Column(Integer, ForeignKey('records.id'))
+    bibcode = Column(String(255))
+    doctype_boost = Column(Float)
+    cite_boost = Column(Float)
+    recency_boost = Column(Float)
+    refereed_boost = Column(Float)
+    cite_boost_astro = Column(Float)
+    cite_boost_geo = Column(Float)
+    cite_boost_planetary = Column(Float)
+    cite_boost_physics = Column(Float)
+    cite_boost_general = Column(Float)
+
+    def toJSON(self):
+        """
+        Converts the BoostFactors object to a JSON representation.
+        Returns:
+            dict: A dictionary containing the JSON representation of the object.
+        """
+        return {
+            'id': self.id,
+            'record_id': self.record_id,
+            'bibcode': self.bibcode,
+             "doctype_boost": self.doctype_boost,
+            "cite_boost": self.cite_boost,
+            "recency_boost": self.recency_boost,
+            "refereed_boost": self.refereed_boost,
+            "cite_boost_astro": self.cite_boost_astro,
+            "cite_boost_geo": self.cite_boost_geo,
+            "cite_boost_planetary": self.cite_boost_planetary,
+            "cite_boost_physics": self.cite_boost_physics,
+            "cite_boost_general": self.cite_boost_general,
+        }
diff --git a/adsmp/solr_updater.py b/adsmp/solr_updater.py
@@ -21,7 +21,6 @@ def extract_metrics_pipeline(data, solrdoc):
 
     return dict(citation=citation)
 
-
 def extract_data_pipeline(data, solrdoc):
     reader = data.get("readers", [])
     read_count = len(reader)
@@ -301,7 +300,6 @@ def get_timestamps(db_record, out):
         out["update_timestamp"] = date2solrstamp(last_update)
     return out
 
-
 DB_COLUMN_DESTINATIONS = [
     ("bib_data", ""),
     ("orcid_claims", get_orcid_claims),
@@ -466,6 +464,9 @@ def transform_json_record(db_record):
                         db_record["bibcode"], type(links_data), links_data
                     )
                 )
+    if db_record.get("doctype_boost", None):
+        out["doctype_boost"] = db_record.get("doctype_boost")
+
     if config.get("ENABLE_HAS", False):
         # Read-in names of fields to check for solr "has:" field
         hasfields = sorted(config.get("HAS_FIELDS", []))

diff --git a/adsmp/tasks.py b/adsmp/tasks.py
@@ -191,6 +191,11 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
             logger.error('The bibcode %s doesn\'t exist!', bibcode)
             continue
 
+        # Adding boost factors to r to pass it to solr
+        doctype_boost = app.get_doctype_boost(bibcode)
+        if doctype_boost:
+            r["doctype_boost"] = doctype_boost
+
         augments_updated = r.get('augments_updated', None)
         bib_data_updated = r.get('bib_data_updated', None)
         fulltext_updated = r.get('fulltext_updated', None)

diff --git a/adsmp/tests/test_app.py b/adsmp/tests/test_app.py
@@ -64,7 +64,7 @@ def test_mark_processed(self):
         r = self.app.get_record('abc')
         self.assertEqual(r, None)
 
-        self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
+        self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'doctype': 'article'})
         self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
         r = self.app.get_record('abc')
 
@@ -76,6 +76,7 @@ def test_mark_processed(self):
         self.assertTrue(r['solr_processed'])
         self.assertTrue(r['processed'])
         self.assertEqual(r['status'], 'solr-failed')
+        self.assertEqual(self.app.get_doctype_boost, 1)
 
     def test_index_solr(self):
         self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})

diff --git a/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py b/alembic/versions/6973dd1d9f2b_added_boostfactors_table.py
@@ -0,0 +1,45 @@
+"""Added BoostFactors table
+
+Revision ID: 6973dd1d9f2b
+Revises: 2d2af8a9c996
+Create Date: 2024-11-20 18:23:26.770835
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = '6973dd1d9f2b'
+down_revision = '2d2af8a9c996'
+
+from alembic import op
+import sqlalchemy as sa
+
+
+def upgrade():
+    #with app.app_context() as c:
+    #   db.session.add(Model())
+    #   db.session.commit()
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('boostfactors',
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('record_id', sa.Integer(), nullable=True),
+    sa.Column('bibcode', sa.String(length=255), nullable=True),
+    sa.Column('doctype_boost', sa.Float(), nullable=True),
+    sa.Column('cite_boost', sa.Float(), nullable=True),
+    sa.Column('recency_boost', sa.Float(), nullable=True),
+    sa.Column('refereed_boost', sa.Float(), nullable=True),
+    sa.Column('cite_boost_astro', sa.Float(), nullable=True),
+    sa.Column('cite_boost_geo', sa.Float(), nullable=True),
+    sa.Column('cite_boost_planetary', sa.Float(), nullable=True),
+    sa.Column('cite_boost_physics', sa.Float(), nullable=True),
+    sa.Column('cite_boost_general', sa.Float(), nullable=True),
+    sa.ForeignKeyConstraint(['record_id'], ['records.id'], ),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('boostfactors')
+    # ### end Alembic commands ###
diff --git a/config.py b/config.py
@@ -66,3 +66,27 @@
     "uat",
     "volume",
 ]
+
+DOCTYPE_RANKING = {
+    "article": 1,
+    "eprint": 1,
+    "inproceedings": 2,
+    "inbook": 1,
+    "abstract": 4,
+    "book": 1,
+    "bookreview": 4,
+    "catalog": 2,
+    "circular": 3,
+    "erratum": 6,
+    "mastersthesis": 3,
+    "newsletter": 5,
+    "obituary": 6,
+    "phdthesis": 3,
+    "pressrelease": 7,
+    "proceedings": 3,
+    "proposal": 4,
+    "software": 2,
+    "talk": 4,
+    "techreport": 3,
+    "misc": 8
+}