Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion adsmp/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from __future__ import absolute_import, unicode_literals
from past.builtins import basestring
from . import exceptions
from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records
from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, BoostFactors
from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord
from adsmsg.msg import Msg
from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager
Expand Down Expand Up @@ -71,6 +71,16 @@ def __init__(self, app_name, *args, **kwargs):
'rn_citations': getattr(self._metrics_table_upsert.excluded, 'rn_citations'),
'rn_citation_data': getattr(self._metrics_table_upsert.excluded, 'rn_citation_data')}
self._metrics_table_upsert = self._metrics_table_upsert.on_conflict_do_update(index_elements=['bibcode'], set_=update_columns)

if self._config.get("DOCTYPE_RANKING", False):
doctype_rank = self._config.get("DOCTYPE_RANKING")
unique_ranks = sorted(set(doctype_rank.values()))

# Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score)
rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)}

# Assign scores to each rank
self._doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()}

def update_storage(self, bibcode, type, payload):
"""Update the document in the database, every time
Expand Down Expand Up @@ -120,6 +130,9 @@ def update_storage(self, bibcode, type, payload):

r.updated = now
out = r.toJSON()
if out.get("bib_data", None):
self.generate_doctype_boost(out["bib_data"].get("bibcode", None),out["bib_data"].get("doctype", None) )

try:
session.commit()
return out
Expand All @@ -136,6 +149,12 @@ def delete_by_bibcode(self, bibcode):
session.delete(r)
session.commit()
return True

b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
if b is not None:
session.delete(b)
session.commit()
return True

def rename_bibcode(self, old_bibcode, new_bibcode):
assert old_bibcode and new_bibcode
Expand Down Expand Up @@ -555,3 +574,29 @@ def generate_links_for_resolver(self, record):
# here if record holds unexpected value
self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record))
return resolver_record

def generate_doctype_boost(self, bibcode, doctype):
    """Store the doctype boost for `bibcode` in the BoostFactors table.

    The score is looked up in `self._doctype_scores`, which `__init__`
    builds only when the `DOCTYPE_RANKING` config entry is present. A
    doctype that is missing from that table is stored as None.

    Nothing is written when `bibcode` or `doctype` is empty, or when no
    score table has been configured.

    :param bibcode: bibcode of the record whose boost is being stored
    :param doctype: doctype of the record, used as key into the score table
    :raises sqlalchemy.exc.IntegrityError: re-raised after logging and
        rolling back when the commit fails
    """
    if not (bibcode and doctype):
        return

    # `_doctype_scores` does not exist when DOCTYPE_RANKING is not configured;
    # skip instead of failing with AttributeError.
    scores = getattr(self, '_doctype_scores', None)
    if scores is None:
        return

    with self.session_scope() as session:
        b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()

        if b is None:
            b = BoostFactors(bibcode=bibcode)
            session.add(b)

        b.doctype_boost = scores.get(doctype, None)

        try:
            session.commit()
        except exc.IntegrityError:
            # Log the `bibcode` argument: there is no local `r` in this method.
            self.logger.exception('error in app.generate_doctype_boost while updating database for bibcode {}'.format(bibcode))
            session.rollback()
            raise

def get_doctype_boost(self, bibcode):
    """Return the stored doctype boost for `bibcode`.

    :param bibcode: bibcode to look up in the BoostFactors table
    :return: the `doctype_boost` value of the first matching row, or None
        when `bibcode` is empty or no row exists
    """
    if not bibcode:
        return None

    with self.session_scope() as session:
        row = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
        return row.doctype_boost if row else None
58 changes: 57 additions & 1 deletion adsmp/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from adsputils import get_date
from datetime import datetime
from dateutil.tz import tzutc
from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime
from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime, Float
from sqlalchemy import types
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import Enum
from sqlalchemy.dialects import postgresql
from sqlalchemy import text
from sqlalchemy import ForeignKey
import json

Base = declarative_base()
Expand Down Expand Up @@ -184,3 +185,58 @@ def toJSON(self):
rn_citations=self.rn_citations,
rn_citation_data=self.rn_citation_data,
modtime=self.modtime and get_date(self.modtime).isoformat() or None)

class BoostFactors(Base):
    """
    Boost factors stored per bibcode.

    Columns:
        id: primary key of the table
        record_id: foreign key to `records.id`
        bibcode: bibcode of the record
        doctype_boost: boost between 0 and 1 derived from the record's doctype
        cite_boost: boost between 0 and 1 combining the per-collection
            citation boosts relevant to the record
        recency_boost: boost between 0 and 1 derived from the record's pubyear
        refereed_boost: boost between 0 and 1 derived from refereed status
        cite_boost_astro / cite_boost_geo / cite_boost_planetary /
        cite_boost_physics / cite_boost_general: boost between 0 and 1 based
            on the record's citation count relative to the astro, earth
            science, planetary science, physics and general collection
            (respectively) and year
    """

    __tablename__ = 'boostfactors'

    # Identity and link to the owning record.
    # NOTE(review): the author annotated `id` with "bigint" although the
    # column is declared Integer - confirm which is intended.
    id = Column(Integer, primary_key=True)
    record_id = Column(Integer, ForeignKey('records.id'))
    bibcode = Column(String(255))

    # Individual boost factors.
    doctype_boost = Column(Float)
    cite_boost = Column(Float)
    recency_boost = Column(Float)
    refereed_boost = Column(Float)

    # Citation boosts per collection.
    cite_boost_astro = Column(Float)
    cite_boost_geo = Column(Float)
    cite_boost_planetary = Column(Float)
    cite_boost_physics = Column(Float)
    cite_boost_general = Column(Float)

    def toJSON(self):
        """
        Serialise this row to a plain dictionary.

        Returns:
            dict: one entry per column, keyed by column name.
        """
        field_names = (
            'id',
            'record_id',
            'bibcode',
            "doctype_boost",
            "cite_boost",
            "recency_boost",
            "refereed_boost",
            "cite_boost_astro",
            "cite_boost_geo",
            "cite_boost_planetary",
            "cite_boost_physics",
            "cite_boost_general",
        )
        return {name: getattr(self, name) for name in field_names}
5 changes: 3 additions & 2 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def extract_metrics_pipeline(data, solrdoc):

return dict(citation=citation)


def extract_data_pipeline(data, solrdoc):
reader = data.get("readers", [])
read_count = len(reader)
Expand Down Expand Up @@ -301,7 +300,6 @@ def get_timestamps(db_record, out):
out["update_timestamp"] = date2solrstamp(last_update)
return out


DB_COLUMN_DESTINATIONS = [
("bib_data", ""),
("orcid_claims", get_orcid_claims),
Expand Down Expand Up @@ -466,6 +464,9 @@ def transform_json_record(db_record):
db_record["bibcode"], type(links_data), links_data
)
)
if db_record.get("doctype_boost", None):
out["doctype_boost"] = db_record.get("doctype_boost")

if config.get("ENABLE_HAS", False):
# Read-in names of fields to check for solr "has:" field
hasfields = sorted(config.get("HAS_FIELDS", []))
Expand Down
5 changes: 5 additions & 0 deletions adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
logger.error('The bibcode %s doesn\'t exist!', bibcode)
continue

# Adding boost factors to r to pass it to solr
doctype_boost = app.get_doctype_boost(bibcode)
if doctype_boost:
r["doctype_boost"] = doctype_boost

augments_updated = r.get('augments_updated', None)
bib_data_updated = r.get('bib_data_updated', None)
fulltext_updated = r.get('fulltext_updated', None)
Expand Down
3 changes: 2 additions & 1 deletion adsmp/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_mark_processed(self):
r = self.app.get_record('abc')
self.assertEqual(r, None)

self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'doctype': 'article'})
self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
r = self.app.get_record('abc')

Expand All @@ -76,6 +76,7 @@ def test_mark_processed(self):
self.assertTrue(r['solr_processed'])
self.assertTrue(r['processed'])
self.assertEqual(r['status'], 'solr-failed')
self.assertEqual(self.app.get_doctype_boost, 1)

def test_index_solr(self):
self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
Expand Down
45 changes: 45 additions & 0 deletions alembic/versions/6973dd1d9f2b_added_boostfactors_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Added BoostFactors table

Revision ID: 6973dd1d9f2b
Revises: 2d2af8a9c996
Create Date: 2024-11-20 18:23:26.770835

"""

# revision identifiers, used by Alembic.
revision = '6973dd1d9f2b'
down_revision = '2d2af8a9c996'

from alembic import op
import sqlalchemy as sa


def upgrade():
    """Create the ``boostfactors`` table.

    The table has an integer primary key ``id``, a nullable ``record_id``
    referencing ``records.id``, a nullable ``bibcode`` and one nullable
    float column per boost factor.
    """
    boost_column_names = (
        'doctype_boost',
        'cite_boost',
        'recency_boost',
        'refereed_boost',
        'cite_boost_astro',
        'cite_boost_geo',
        'cite_boost_planetary',
        'cite_boost_physics',
        'cite_boost_general',
    )

    # Columns first, in declaration order, followed by the table constraints.
    table_items = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('record_id', sa.Integer(), nullable=True),
        sa.Column('bibcode', sa.String(length=255), nullable=True),
    ]
    for column_name in boost_column_names:
        table_items.append(sa.Column(column_name, sa.Float(), nullable=True))
    table_items.append(sa.ForeignKeyConstraint(['record_id'], ['records.id'], ))
    table_items.append(sa.PrimaryKeyConstraint('id'))

    op.create_table('boostfactors', *table_items)


def downgrade():
    """Revert this revision by dropping the ``boostfactors`` table."""
    table_name = 'boostfactors'
    op.drop_table(table_name)
24 changes: 24 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,27 @@
"uat",
"volume",
]

# Rank assigned to each doctype. The app turns the distinct rank values into
# scores evenly spaced between 0 and 1, inverted so that the lowest rank gets
# the highest score; doctypes sharing a rank share a score.
DOCTYPE_RANKING = {
    "article": 1,
    "eprint": 1,
    "inproceedings": 2,
    "inbook": 1,
    "abstract": 4,
    "book": 1,
    "bookreview": 4,
    "catalog": 2,
    "circular": 3,
    "erratum": 6,
    "mastersthesis": 3,
    "newsletter": 5,
    "obituary": 6,
    "phdthesis": 3,
    "pressrelease": 7,
    "proceedings": 3,
    "proposal": 4,
    "software": 2,
    "talk": 4,
    "techreport": 3,
    "misc": 8
}