Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[18.0][PORT] HA job runner using session level advisory lock #757

Merged
merged 4 commits into from
Mar 25, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 35 additions & 6 deletions queue_job/jobrunner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,17 @@

SELECT_TIMEOUT = 60
ERROR_RECOVERY_DELAY = 5
PG_ADVISORY_LOCK_ID = 2293787760715711918

_logger = logging.getLogger(__name__)

select = selectors.DefaultSelector


class MasterElectionLost(Exception):
pass


# Unfortunately, it is not possible to extend the Odoo
# server command line arguments, so we resort to environment variables
# to configure the runner (channels mostly).
Expand Down Expand Up @@ -262,10 +267,15 @@
self.db_name = db_name
connection_info = _connection_info_for(db_name)
self.conn = psycopg2.connect(**connection_info)
self.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
self.has_queue_job = self._has_queue_job()
if self.has_queue_job:
self._initialize()
try:
self.conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
self.has_queue_job = self._has_queue_job()

Check warning on line 272 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L270-L272

Added lines #L270 - L272 were not covered by tests
if self.has_queue_job:
self._acquire_master_lock()
self._initialize()
except BaseException:
self.close()
raise

Check warning on line 278 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L274-L278

Added lines #L274 - L278 were not covered by tests

def close(self):
# pylint: disable=except-pass
Expand All @@ -278,6 +288,14 @@
pass
self.conn = None

def _acquire_master_lock(self):
"""Acquire the master runner lock or raise MasterElectionLost"""
with closing(self.conn.cursor()) as cr:
cr.execute("SELECT pg_try_advisory_lock(%s)", (PG_ADVISORY_LOCK_ID,))

Check warning on line 294 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L293-L294

Added lines #L293 - L294 were not covered by tests
if not cr.fetchone()[0]:
msg = f"could not acquire master runner lock on {self.db_name}"
raise MasterElectionLost(msg)

Check warning on line 297 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L296-L297

Added lines #L296 - L297 were not covered by tests

def _has_queue_job(self):
with closing(self.conn.cursor()) as cr:
cr.execute(
Expand Down Expand Up @@ -413,14 +431,17 @@
self.db_by_name = {}

def initialize_databases(self):
for db_name in self.get_db_names():
for db_name in sorted(self.get_db_names()):
# sorting is important to avoid deadlocks in acquiring the master lock
db = Database(db_name)
if db.has_queue_job:
self.db_by_name[db_name] = db
with db.select_jobs("state in %s", (NOT_DONE,)) as cr:
for job_data in cr:
self.channel_manager.notify(db_name, *job_data)
_logger.info("queue job runner ready for db %s", db_name)
else:
db.close()

Check warning on line 444 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L444

Added line #L444 was not covered by tests

def run_jobs(self):
now = _odoo_now()
Expand Down Expand Up @@ -507,7 +528,7 @@
while not self._stop:
# outer loop does exception recovery
try:
_logger.info("initializing database connections")
_logger.debug("initializing database connections")

Check warning on line 531 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L531

Added line #L531 was not covered by tests
# TODO: how to detect new databases or databases
# on which queue_job is installed after server start?
self.initialize_databases()
Expand All @@ -522,6 +543,14 @@
except InterruptedError:
# Interrupted system call, i.e. KeyboardInterrupt during select
self.stop()
except MasterElectionLost as e:
_logger.debug(

Check warning on line 547 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L546-L547

Added lines #L546 - L547 were not covered by tests
"master election lost: %s, sleeping %ds and retrying",
e,
ERROR_RECOVERY_DELAY,
)
self.close_databases()
time.sleep(ERROR_RECOVERY_DELAY)

Check warning on line 553 in queue_job/jobrunner/runner.py

View check run for this annotation

Codecov / codecov/patch

queue_job/jobrunner/runner.py#L552-L553

Added lines #L552 - L553 were not covered by tests
except Exception:
_logger.exception(
"exception: sleeping %ds and retrying", ERROR_RECOVERY_DELAY
Expand Down