From d329517d9af202c3353afcc5e7b5912d0911b3f5 Mon Sep 17 00:00:00 2001 From: hoangtrann Date: Sat, 22 Nov 2025 06:05:08 +0700 Subject: [PATCH 1/2] [IMP] queue_job: requeue orphaned jobs --- queue_job/jobrunner/runner.py | 37 ++++++++++++++++++++++++ queue_job/tests/test_requeue_dead_job.py | 31 ++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/queue_job/jobrunner/runner.py b/queue_job/jobrunner/runner.py index a0db6751db..ccdbeaec11 100644 --- a/queue_job/jobrunner/runner.py +++ b/queue_job/jobrunner/runner.py @@ -386,6 +386,35 @@ def _query_requeue_dead_jobs(self): RETURNING uuid """ + def _query_requeue_orphaned_jobs(self): + """Query to requeue jobs stuck in 'enqueued' state without a lock. + + This handles the edge case where the runner marks a job as 'enqueued' + but the HTTP request to start the job never reaches the Odoo server + (e.g., due to server shutdown/crash between setting enqueued and + the controller receiving the request). These jobs have no lock record + because set_started() was never called, so they are invisible to + _query_requeue_dead_jobs(). + """ + return """ + UPDATE + queue_job + SET + state='pending' + WHERE + state = 'enqueued' + AND date_enqueued < (now() AT TIME ZONE 'utc' - INTERVAL '10 sec') + AND NOT EXISTS ( + SELECT + 1 + FROM + queue_job_lock + WHERE + queue_job_id = queue_job.id + ) + RETURNING uuid + """ + def requeue_dead_jobs(self): """ Set started and enqueued jobs but not locked to pending @@ -414,6 +443,14 @@ def requeue_dead_jobs(self): for (uuid,) in cr.fetchall(): _logger.warning("Re-queued dead job with uuid: %s", uuid) + # Requeue orphaned jobs (enqueued but never started, no lock) + query = self._query_requeue_orphaned_jobs() + cr.execute(query) + for (uuid,) in cr.fetchall(): + _logger.warning( + "Re-queued orphaned job (enqueued without lock) with uuid: %s", uuid + ) + class QueueJobRunner: def __init__( diff --git a/queue_job/tests/test_requeue_dead_job.py b/queue_job/tests/test_requeue_dead_job.py index c6c82a2f4d..5c6ca47e52 100644 --- a/queue_job/tests/test_requeue_dead_job.py +++ b/queue_job/tests/test_requeue_dead_job.py @@ -131,3 +131,34 @@ def test_requeue_dead_jobs(self): # because we committed the cursor, the savepoint of the test method is # gone, and this would break TransactionCase cleanups self.cr.execute("SAVEPOINT test_%d" % self._savepoint_id) + + def test_requeue_orphaned_jobs(self): + uuid = "test_enqueued_job" + queue_job = self.create_dummy_job(uuid) + job_obj = Job.load(self.env, queue_job.uuid) + + # Only enqueued job, don't set it to started to simulate the scenario + # that system shutdown before job is starting + job_obj.set_enqueued() + job_obj.date_enqueued = datetime.now() - timedelta(minutes=1) + job_obj.store() + + # job ins't actually picked up by the first requeue attempt + query = Database(self.env.cr.dbname)._query_requeue_dead_jobs() + self.env.cr.execute(query) + uuids_requeued = self.env.cr.fetchall() + self.assertFalse(uuids_requeued) + + # job is picked up by the 2nd requeue attempt + query = Database(self.env.cr.dbname)._query_requeue_orphaned_jobs() + self.env.cr.execute(query) + uuids_requeued = self.env.cr.fetchall() + self.assertTrue(queue_job.uuid in j[0] for j in uuids_requeued) + + # clean up + queue_job.unlink() + self.env.cr.commit() # pylint: disable=E8102 + + # because we committed the cursor, the savepoint of the test method is + # gone, and this would break TransactionCase cleanups + self.cr.execute("SAVEPOINT test_%d" % self._savepoint_id) From c7324dbf91311d84f4636b91e0917211c152531c Mon Sep 17 00:00:00 2001 From: hoangtrann Date: Wed, 31 Dec 2025 19:27:16 +0700 Subject: [PATCH 2/2] [IMP] queue_job: query orphaned dead job not exist in lock table --- queue_job/jobrunner/runner.py | 78 ++++++++---------------- queue_job/tests/test_requeue_dead_job.py | 8 +-- 2 files changed, 26 insertions(+), 60 deletions(-) diff --git a/queue_job/jobrunner/runner.py b/queue_job/jobrunner/runner.py index ccdbeaec11..4cf063e818 100644 --- a/queue_job/jobrunner/runner.py +++ b/queue_job/jobrunner/runner.py @@ -365,52 +365,26 @@ def _query_requeue_dead_jobs(self): ELSE exc_info END) WHERE - id in ( - SELECT - queue_job_id - FROM - queue_job_lock - WHERE - queue_job_id in ( - SELECT - id - FROM - queue_job - WHERE - state IN ('enqueued','started') - AND date_enqueued < - (now() AT TIME ZONE 'utc' - INTERVAL '10 sec') - ) - FOR UPDATE SKIP LOCKED - ) - RETURNING uuid - """ - - def _query_requeue_orphaned_jobs(self): - """Query to requeue jobs stuck in 'enqueued' state without a lock. - - This handles the edge case where the runner marks a job as 'enqueued' - but the HTTP request to start the job never reaches the Odoo server - (e.g., due to server shutdown/crash between setting enqueued and - the controller receiving the request). These jobs have no lock record - because set_started() was never called, so they are invisible to - _query_requeue_dead_jobs(). - """ - return """ - UPDATE - queue_job - SET - state='pending' - WHERE - state = 'enqueued' + state IN ('enqueued','started') AND date_enqueued < (now() AT TIME ZONE 'utc' - INTERVAL '10 sec') - AND NOT EXISTS ( - SELECT - 1 - FROM - queue_job_lock - WHERE - queue_job_id = queue_job.id + AND ( + id in ( + SELECT + queue_job_id + FROM + queue_job_lock + WHERE + queue_job_lock.queue_job_id = queue_job.id + FOR UPDATE SKIP LOCKED + ) + OR NOT EXISTS ( + SELECT + 1 + FROM + queue_job_lock + WHERE + queue_job_lock.queue_job_id = queue_job.id + ) ) RETURNING uuid """ @@ -433,6 +407,12 @@ def requeue_dead_jobs(self): However, when the Odoo server crashes or is otherwise force-stopped, running jobs are interrupted while the runner has no chance to know they have been aborted. + + This also handles orphaned jobs (enqueued but never started, no lock). + This edge case occurs when the runner marks a job as 'enqueued' + but the HTTP request to start the job never reaches the Odoo server + (e.g., due to server shutdown/crash between setting enqueued and + the controller receiving the request). """ with closing(self.conn.cursor()) as cr: @@ -443,14 +423,6 @@ def requeue_dead_jobs(self): for (uuid,) in cr.fetchall(): _logger.warning("Re-queued dead job with uuid: %s", uuid) - # Requeue orphaned jobs (enqueued but never started, no lock) - query = self._query_requeue_orphaned_jobs() - cr.execute(query) - for (uuid,) in cr.fetchall(): - _logger.warning( - "Re-queued orphaned job (enqueued without lock) with uuid: %s", uuid - ) - class QueueJobRunner: def __init__( diff --git a/queue_job/tests/test_requeue_dead_job.py b/queue_job/tests/test_requeue_dead_job.py index 5c6ca47e52..180e1294eb 100644 --- a/queue_job/tests/test_requeue_dead_job.py +++ b/queue_job/tests/test_requeue_dead_job.py @@ -143,16 +143,10 @@ def test_requeue_orphaned_jobs(self): job_obj.date_enqueued = datetime.now() - timedelta(minutes=1) job_obj.store() - # job ins't actually picked up by the first requeue attempt + # job is now picked up by the requeue query (which includes orphaned jobs) query = Database(self.env.cr.dbname)._query_requeue_dead_jobs() self.env.cr.execute(query) uuids_requeued = self.env.cr.fetchall() - self.assertFalse(uuids_requeued) - - # job is picked up by the 2nd requeue attempt - query = Database(self.env.cr.dbname)._query_requeue_orphaned_jobs() - self.env.cr.execute(query) - uuids_requeued = self.env.cr.fetchall() self.assertTrue(queue_job.uuid in j[0] for j in uuids_requeued) # clean up