Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion adsmp/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from __future__ import absolute_import, unicode_literals
from past.builtins import basestring
from . import exceptions
from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records
from adsmp.models import ChangeLog, IdentifierMapping, MetricsBase, MetricsModel, Records, BoostFactors
from adsmsg import OrcidClaims, DenormalizedRecord, FulltextUpdate, MetricsRecord, NonBibRecord, NonBibRecordList, MetricsRecordList, AugmentAffiliationResponseRecord, AugmentAffiliationRequestRecord
from adsmsg.msg import Msg
from adsputils import ADSCelery, create_engine, sessionmaker, scoped_session, contextmanager
Expand Down Expand Up @@ -71,6 +71,16 @@ def __init__(self, app_name, *args, **kwargs):
'rn_citations': getattr(self._metrics_table_upsert.excluded, 'rn_citations'),
'rn_citation_data': getattr(self._metrics_table_upsert.excluded, 'rn_citation_data')}
self._metrics_table_upsert = self._metrics_table_upsert.on_conflict_do_update(index_elements=['bibcode'], set_=update_columns)

if self._config.get("DOCTYPE_RANKING", False):
doctype_rank = self._config.get("DOCTYPE_RANKING")
unique_ranks = sorted(set(doctype_rank.values()))

# Map ranks to scores evenly spaced between 0 and 1 (invert: lowest rank gets the highest score)
rank_to_score = {rank: 1 - ( i / (len(unique_ranks) - 1)) for i, rank in enumerate(unique_ranks)}

# Assign scores to each rank
self._doctype_scores = {doctype: rank_to_score[rank] for doctype, rank in doctype_rank.items()}

def update_storage(self, bibcode, type, payload):
"""Update the document in the database, every time
Expand Down Expand Up @@ -120,6 +130,9 @@ def update_storage(self, bibcode, type, payload):

r.updated = now
out = r.toJSON()
if out.get("bib_data", None):
doctype_boost = self.generate_doctype_boost(out["bib_data"].get("bibcode", None),out["bib_data"].get("doctype", None) )

try:
session.commit()
return out
Expand All @@ -136,6 +149,12 @@ def delete_by_bibcode(self, bibcode):
session.delete(r)
session.commit()
return True

b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
if b is not None:
session.delete(b)
session.commit()
return True

def rename_bibcode(self, old_bibcode, new_bibcode):
assert old_bibcode and new_bibcode
Expand Down Expand Up @@ -555,3 +574,47 @@ def generate_links_for_resolver(self, record):
# here if record holds unexpected value
self.logger.error('invalid value in bib data, bibcode = {}, type = {}, value = {}'.format(bibcode, type(bib_links_record), bib_links_record))
return resolver_record

def generate_doctype_boost(self, bibcode, doctype=None):
    """Compute the doctype boost of a record and store it in BoostFactors.

    The score is looked up in ``self._doctype_scores`` (the doctype -> score
    mapping built in ``__init__`` from the ``DOCTYPE_RANKING`` config) and
    written to the ``doctype_boost`` column of the BoostFactors row of the
    bibcode; the row is created when it does not exist yet.

    :param bibcode: string, bibcode of the record
    :param doctype: string, optional; when empty it is read from the
        ``bib_data`` of the stored Records row
    :return: the stored score (None when the doctype has no score), or None
        when the bibcode is empty or no doctype could be determined
    :raises exc.IntegrityError: re-raised after rollback when the commit fails
    """
    # Nothing can be looked up or stored without a bibcode.
    if not bibcode:
        return None

    if not doctype:
        with self.session_scope() as session:
            record = session.query(Records).filter_by(bibcode=bibcode).first()
            # Check for a missing record *before* serializing it.
            if record is not None:
                # update_storage reads the doctype from the 'bib_data' key of
                # Records.toJSON(); use the same key here.
                bib_data = record.toJSON().get("bib_data", None) or {}
                doctype = bib_data.get("doctype", None)

    if not doctype:
        return None

    # _doctype_scores is only set when DOCTYPE_RANKING is configured.
    scores = getattr(self, "_doctype_scores", None) or {}
    boost = scores.get(doctype, None)

    with self.session_scope() as session:
        b = session.query(BoostFactors).filter_by(bibcode=bibcode).first()

        if b is None:
            b = BoostFactors(bibcode=bibcode)
            session.add(b)

        b.doctype_boost = boost

        try:
            session.commit()
        except exc.IntegrityError:
            self.logger.exception('error in app.generate_doctype_boost while updating database for bibcode {}'.format(bibcode))
            session.rollback()
            raise

    # Return the local value so no ORM attribute is read after the commit.
    return boost

def get_doctype_boost(self, bibcode):
    """Return the doctype boost stored for a bibcode.

    :param bibcode: string, bibcode to look up in the boostfactors table
    :return: value of the ``doctype_boost`` column, or None when the bibcode
        is empty or has no BoostFactors row
    """
    if not bibcode:
        return None

    with self.session_scope() as session:
        row = session.query(BoostFactors).filter_by(bibcode=bibcode).first()
        return row.doctype_boost if row else None

def delete_table_contents(self, table):
    """Remove every row of the given table.

    :param table: object handed to ``session.query()``; the
        populate-boostfactors task passes the ``BoostFactors`` model class
    """
    with self.session_scope() as db:
        all_rows = db.query(table)
        all_rows.delete()
        db.commit()
58 changes: 57 additions & 1 deletion adsmp/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from adsputils import get_date
from datetime import datetime
from dateutil.tz import tzutc
from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime
from sqlalchemy import Column, Integer, BigInteger, String, Text, TIMESTAMP, Boolean, DateTime, Float
from sqlalchemy import types
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import Enum
from sqlalchemy.dialects import postgresql
from sqlalchemy import text
from sqlalchemy import ForeignKey
import json

Base = declarative_base()
Expand Down Expand Up @@ -184,3 +185,58 @@ def toJSON(self):
rn_citations=self.rn_citations,
rn_citation_data=self.rn_citation_data,
modtime=self.modtime and get_date(self.modtime).isoformat() or None)

class BoostFactors(Base):
    """
    Boost factors associated with a bibcode.

    Every ``*_boost`` column is documented as a factor between 0 and 1.

    Attributes:
        id (int): primary key of the table
        record_id (int): foreign key referencing ``records.id``
        bibcode (str): bibcode of the record
        doctype_boost (float): based on the doctype of the record
        recency_boost (float): based on the pubyear of the record
        refereed_boost (float): based on the refereed status of the record
        cite_boost_astro (float): based on the number of citations relative
            to the astro collection and year
        cite_boost_geo (float): same, relative to the earth science collection
        cite_boost_planetary (float): same, relative to the planetary science
            collection
        cite_boost_physics (float): same, relative to the physics collection
        cite_boost_general (float): same, relative to the general collection
        cite_boost (float): combination of the other cite_boost scores for
            all the collections relevant to the record
    """

    __tablename__ = 'boostfactors'

    # NOTE(review): the original comment on this column reads "bigint"
    # although the declared type is Integer - confirm which one is intended.
    id = Column(Integer, primary_key=True)
    record_id = Column(Integer, ForeignKey('records.id'))
    bibcode = Column(String(255))
    doctype_boost = Column(Float)
    cite_boost = Column(Float)
    recency_boost = Column(Float)
    refereed_boost = Column(Float)
    cite_boost_astro = Column(Float)
    cite_boost_geo = Column(Float)
    cite_boost_planetary = Column(Float)
    cite_boost_physics = Column(Float)
    cite_boost_general = Column(Float)

    def toJSON(self):
        """
        Serialize the row into a plain dictionary.

        Returns:
            dict: column name -> current value, for every column of the table.
        """
        # Listed in the order the keys appear in the output.
        field_names = (
            'id',
            'record_id',
            'bibcode',
            'doctype_boost',
            'cite_boost',
            'recency_boost',
            'refereed_boost',
            'cite_boost_astro',
            'cite_boost_geo',
            'cite_boost_planetary',
            'cite_boost_physics',
            'cite_boost_general',
        )
        return {name: getattr(self, name) for name in field_names}
7 changes: 5 additions & 2 deletions adsmp/solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def extract_metrics_pipeline(data, solrdoc):

return dict(citation=citation)


def extract_data_pipeline(data, solrdoc):
reader = data.get("readers", [])
read_count = len(reader)
Expand Down Expand Up @@ -301,7 +300,6 @@ def get_timestamps(db_record, out):
out["update_timestamp"] = date2solrstamp(last_update)
return out


DB_COLUMN_DESTINATIONS = [
("bib_data", ""),
("orcid_claims", get_orcid_claims),
Expand Down Expand Up @@ -466,6 +464,11 @@ def transform_json_record(db_record):
db_record["bibcode"], type(links_data), links_data
)
)
out["doctype_boost"] = None
if db_record.get("nonbib_data"):
if db_record["nonbib_data"].get("doctype_boost", None):
out["doctype_boost"] = db_record["nonbib_data"]["doctype_boost"]

if config.get("ENABLE_HAS", False):
# Read-in names of fields to check for solr "has:" field
hasfields = sorted(config.get("HAS_FIELDS", []))
Expand Down
29 changes: 28 additions & 1 deletion adsmp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from adsmp import solr_updater
from kombu import Queue
from adsmsg.msg import Msg

from adsmp.models import BoostFactors
# ============================= INITIALIZATION ==================================== #

proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
Expand Down Expand Up @@ -191,6 +191,11 @@ def reindex_records(bibcodes, force=False, update_solr=True, update_metrics=True
logger.error('The bibcode %s doesn\'t exist!', bibcode)
continue

# Adding boost factors to r to pass it to solr
doctype_boost = app.get_doctype_boost(bibcode)
if doctype_boost:
r["nonbib_data"]["doctype_boost"] = doctype_boost
logger.debug("doctype_boost ",doctype_boost)
augments_updated = r.get('augments_updated', None)
bib_data_updated = r.get('bib_data_updated', None)
fulltext_updated = r.get('fulltext_updated', None)
Expand Down Expand Up @@ -304,6 +309,28 @@ def task_delete_documents(bibcode):
else:
logger.debug('Failed to deleted metrics record: %s', bibcode)

@app.task(queue='populate-boostfactors-table')
def task_populate_boostfactors_table(bibcodes, boost_action):
    """
    Populate or reset the boostfactors table.

    :param bibcodes: a bibcode (string) or a list of bibcodes; only used by
        the 'add' action
    :param boost_action: 'add' to (re)generate the doctype boost of every
        given bibcode, 'delete-tablecontents' to empty the whole table
    """
    valid_actions = ('add', 'delete-tablecontents')
    if boost_action not in valid_actions:
        # Report the actions that are really accepted by the check above.
        logger.error("Invalid boost_action %s, must be one of: %s", boost_action, ', '.join(valid_actions))
        return

    if boost_action == 'delete-tablecontents':
        # reset and empty all entries in boostfactors table
        app.delete_table_contents(BoostFactors)
        return

    # Only 'add' is left from here on.
    if not bibcodes:
        # None or an empty list/string: iterating would fail or do nothing.
        logger.warning('No bibcodes given, nothing to add to boostfactors table')
        return

    # NOTE(review): basestring is used as in the original block; it is
    # assumed to be imported at the top of this module - confirm.
    if isinstance(bibcodes, basestring):
        bibcodes = [bibcodes]

    logger.debug('Updating doctype_boost info in boostfactors table for: %s', bibcodes)
    for bibcode in bibcodes:
        app.generate_doctype_boost(bibcode)

if __name__ == '__main__':
app.start()
4 changes: 2 additions & 2 deletions adsmp/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_mark_processed(self):
r = self.app.get_record('abc')
self.assertEqual(r, None)

self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1, 'doctype': 'article'})
self.app.mark_processed(['abc'], 'solr', checksums=['jkl'], status='success')
r = self.app.get_record('abc')

Expand All @@ -76,6 +76,7 @@ def test_mark_processed(self):
self.assertTrue(r['solr_processed'])
self.assertTrue(r['processed'])
self.assertEqual(r['status'], 'solr-failed')
self.assertEqual(self.app.get_doctype_boost(r["bibcode"]), 1)

def test_index_solr(self):
self.app.update_storage('abc', 'bib_data', {'bibcode': 'abc', 'hey': 1})
Expand Down Expand Up @@ -453,6 +454,5 @@ def test_generate_links_for_resolver(self):
self.assertTrue('testbib' in str(m_args[0]))
self.assertTrue('foobar' in str(m_args[0]))


if __name__ == '__main__':
unittest.main()
12 changes: 12 additions & 0 deletions adsmp/tests/test_solr_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,11 @@ def test_solr_transformer(self):
)
rec = self.app.get_record("bibcode")

# Generate doctype_boost given the doctype from bib_data
doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype'])
# Add doctype_boost to nonbib_data as done in reindex task
rec["nonbib_data"]["doctype_boost"] = doctype_boost

x = solr_updater.transform_json_record(rec)
# self.assertFalse('aff' in x, 'virtual field should not be in solr output')

Expand Down Expand Up @@ -340,6 +345,7 @@ def test_solr_transformer(self):
"volume",
],
)
self.assertEqual(round(x["doctype_boost"],3),0.857)

self.app.update_storage(
"bibcode",
Expand Down Expand Up @@ -513,6 +519,11 @@ def test_solr_transformer(self):
self.assertEqual(x[f], "2017-09-19T21:17:12.026474Z")

rec = self.app.get_record("bibcode")
# Generate doctype_boost given the doctype from bib_data
doctype_boost = self.app.generate_doctype_boost(rec["bibcode"], rec["bib_data"]['doctype'])
# Add doctype_boost to nonbib_data as done in reindex task
rec["nonbib_data"]["doctype_boost"] = doctype_boost

x = solr_updater.transform_json_record(rec)

self.assertTrue("aff" in x) # aff is no longer a virtual field
Expand Down Expand Up @@ -546,6 +557,7 @@ def test_solr_transformer(self):
"volume",
],
)
self.assertEqual(round(x["doctype_boost"],3),0.857)

def test_links_data_merge(self):
# links_data only from bib
Expand Down
6 changes: 3 additions & 3 deletions adsmp/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,15 +537,15 @@ def test_avoid_duplicates(self):
tasks.task_index_records(["foo"], force=True)

self.assertEqual(update_solr.call_count, 1)
self._check_checksum("foo", solr="0x4db9a611")
self._check_checksum("foo", solr="0x8f51bd8d")

# now change metrics (solr shouldn't be called)
getter.return_value = {
"bibcode": "foo",
"metrics_updated": get_date("1972-04-02"),
"bib_data_updated": get_date("1972-04-01"),
"metrics": {},
"solr_checksum": "0x4db9a611",
"solr_checksum": "0x8f51bd8d",
}
tasks.task_index_records(["foo"], force=True)
self.assertEqual(update_solr.call_count, 1)
Expand All @@ -563,7 +563,7 @@ def test_ignore_checksums_solr(self):
"bibcode": "foo",
"metrics_updated": get_date("1972-04-02"),
"bib_data_updated": get_date("1972-04-01"),
"solr_checksum": "0x4db9a611",
"solr_checksum": "0x8f51bd8d",
}

# update with matching checksum and then update and ignore checksums
Expand Down
45 changes: 45 additions & 0 deletions alembic/versions/6973dd1d9f2b_added_boostfactors_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Added BoostFactors table

Revision ID: 6973dd1d9f2b
Revises: 2d2af8a9c996
Create Date: 2024-11-20 18:23:26.770835

"""

# revision identifiers, used by Alembic.
revision = '6973dd1d9f2b'
down_revision = '2d2af8a9c996'

from alembic import op
import sqlalchemy as sa


def upgrade():
    """Create the ``boostfactors`` table."""
    # All boost columns share the same definition: nullable floats.
    boost_names = (
        'doctype_boost',
        'cite_boost',
        'recency_boost',
        'refereed_boost',
        'cite_boost_astro',
        'cite_boost_geo',
        'cite_boost_planetary',
        'cite_boost_physics',
        'cite_boost_general',
    )

    table_items = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('record_id', sa.Integer(), nullable=True),
        sa.Column('bibcode', sa.String(length=255), nullable=True),
    ]
    table_items.extend(
        sa.Column(name, sa.Float(), nullable=True) for name in boost_names
    )
    table_items.append(sa.ForeignKeyConstraint(['record_id'], ['records.id']))
    table_items.append(sa.PrimaryKeyConstraint('id'))

    op.create_table('boostfactors', *table_items)


def downgrade():
    """Drop the ``boostfactors`` table."""
    table_name = 'boostfactors'
    op.drop_table(table_name)
Loading