OCA · sbidoul · Dec 6, 2024 · Feb 1, 2025 · Feb 1, 2025
diff --git a/queue_job/__manifest__.py b/queue_job/__manifest__.py
@@ -2,7 +2,7 @@
 
 {
     "name": "Job Queue",
-    "version": "18.0.1.2.1",
+    "version": "18.0.1.3.0",
     "author": "Camptocamp,ACSONE SA/NV,Odoo Community Association (OCA)",
     "website": "https://github.com/OCA/queue",
     "license": "LGPL-3",

diff --git a/queue_job/controllers/main.py b/queue_job/controllers/main.py
@@ -32,6 +32,8 @@ def _try_perform_job(self, env, job):
         job.set_started()
         job.store()
         env.cr.commit()
+        job.lock()
+
         _logger.debug("%s started", job)
 
         job.perform()

diff --git a/queue_job/data/queue_data.xml b/queue_job/data/queue_data.xml
@@ -1,14 +1,6 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <odoo>
     <data noupdate="1">
-        <record id="ir_cron_queue_job_garbage_collector" model="ir.cron">
-            <field name="name">Jobs Garbage Collector</field>
-            <field name="interval_number">5</field>
-            <field name="interval_type">minutes</field>
-            <field ref="model_queue_job" name="model_id" />
-            <field name="state">code</field>
-            <field name="code">model.requeue_stuck_jobs()</field>
-        </record>
         <!-- Queue-job-related subtypes for messaging / Chatter -->
         <record id="mt_job_failed" model="mail.message.subtype">
             <field name="name">Job failed</field>

diff --git a/queue_job/job.py b/queue_job/job.py
@@ -223,6 +223,61 @@ def load_many(cls, env, job_uuids):
         recordset = cls.db_records_from_uuids(env, job_uuids)
         return {cls._load_from_db_record(record) for record in recordset}
 
+    def add_lock_record(self):
+        """
+        Create the queue_job_lock row locked while the job runs, unless it exists.
+        """
+        self.env.cr.execute(
+            """
+            INSERT INTO
+                queue_job_lock (id, queue_job_id)
+            SELECT
+                id, id
+            FROM
+                queue_job
+            WHERE
+                uuid = %s
+            ON CONFLICT(id)
+            DO NOTHING;
+        """,
+            [self.uuid],
+        )
+
+    def lock(self):
+        """
+        Lock (SELECT ... FOR UPDATE) the queue_job_lock row of this job.
+
+        The row is only selected while the job is in state 'started'.
+        Unless exactly one row comes back, the job is treated as not started
+        and a RetryableJobError is raised.
+        """
+        self.env.cr.execute(
+            """
+            SELECT
+                *
+            FROM
+                queue_job_lock
+            WHERE
+                queue_job_id in (
+                    SELECT
+                        id
+                    FROM
+                        queue_job
+                    WHERE
+                        uuid = %s
+                        AND state='started'
+                )
+            FOR UPDATE;
+        """,
+            [self.uuid],
+        )
+
+        # exactly one lock row must have been selected (and thus locked)
+        if 1 != len(self.env.cr.fetchall()):
+            raise RetryableJobError(
+                f"Trying to lock job that wasn't started, uuid: {self.uuid}"
+            )
+
     @classmethod
     def _load_from_db_record(cls, job_db_record):
         stored = job_db_record
@@ -806,6 +861,7 @@ def set_started(self):
         self.state = STARTED
         self.date_started = datetime.now()
         self.worker_pid = os.getpid()
+        self.add_lock_record()
 
     def set_done(self, result=None):
         self.state = DONE

diff --git a/queue_job/jobrunner/runner.py b/queue_job/jobrunner/runner.py
@@ -114,22 +114,6 @@
 * After creating a new database or installing queue_job on an
   existing database, Odoo must be restarted for the runner to detect it.
 
-* When Odoo shuts down normally, it waits for running jobs to finish.
-  However, when the Odoo server crashes or is otherwise force-stopped,
-  running jobs are interrupted while the runner has no chance to know
-  they have been aborted. In such situations, jobs may remain in
-  ``started`` or ``enqueued`` state after the Odoo server is halted.
-  Since the runner has no way to know if they are actually running or
-  not, and does not know for sure if it is safe to restart the jobs,
-  it does not attempt to restart them automatically. Such stale jobs
-  therefore fill the running queue and prevent other jobs to start.
-  You must therefore requeue them manually, either from the Jobs view,
-  or by running the following SQL statement *before starting Odoo*:
-
-.. code-block:: sql
-
-  update queue_job set state='pending' where state in ('started', 'enqueued')
-
 .. rubric:: Footnotes
 
 .. [1] From a security standpoint, it is safe to have an anonymous HTTP
@@ -155,7 +139,7 @@
 from odoo.tools import config
 
 from . import queue_job_config
-from .channels import ENQUEUED, NOT_DONE, PENDING, ChannelManager
+from .channels import ENQUEUED, NOT_DONE, ChannelManager
 
 SELECT_TIMEOUT = 60
 ERROR_RECOVERY_DELAY = 5
@@ -207,33 +191,12 @@ def _connection_info_for(db_name):
 
 
 def _async_http_get(scheme, host, port, user, password, db_name, job_uuid):
-    # Method to set failed job (due to timeout, etc) as pending,
-    # to avoid keeping it as enqueued.
-    def set_job_pending():
-        connection_info = _connection_info_for(db_name)
-        conn = psycopg2.connect(**connection_info)
-        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
-        with closing(conn.cursor()) as cr:
-            cr.execute(
-                "UPDATE queue_job SET state=%s, "
-                "date_enqueued=NULL, date_started=NULL "
-                "WHERE uuid=%s and state=%s "
-                "RETURNING uuid",
-                (PENDING, job_uuid, ENQUEUED),
-            )
-            if cr.fetchone():
-                _logger.warning(
-                    "state of job %s was reset from %s to %s",
-                    job_uuid,
-                    ENQUEUED,
-                    PENDING,
-                )
-
     # TODO: better way to HTTP GET asynchronously (grequest, ...)?
     #       if this was python3 I would be doing this with
     #       asyncio, aiohttp and aiopg
     def urlopen():
         url = f"{scheme}://{host}:{port}/queue_job/runjob?db={db_name}&job_uuid={job_uuid}"
+        # pylint: disable=except-pass
         try:
             auth = None
             if user:
@@ -247,10 +210,10 @@ def urlopen():
             # for codes between 500 and 600
             response.raise_for_status()
         except requests.Timeout:
-            set_job_pending()
+            # A timeout is a normal behaviour, it shouldn't be logged as an exception
+            pass
         except Exception:
             _logger.exception("exception in GET %s", url)
-            set_job_pending()
 
     thread = threading.Thread(target=urlopen)
     thread.daemon = True
@@ -341,6 +304,93 @@ def set_job_enqueued(self, uuid):
                 (ENQUEUED, uuid),
             )
 
+    def _query_requeue_dead_jobs(self):  # returns the SQL run by requeue_dead_jobs()
+        return """
+            UPDATE
+                queue_job
+            SET
+                state=(
+                    CASE
+                        WHEN
+                            max_retries IS NOT NULL AND
+                            retry IS NOT NULL AND
+                            retry>max_retries
+                        THEN 'failed'
+                        ELSE 'pending'
+                    END),
+                retry=(
+                    CASE
+                        WHEN state='started'
+                        THEN COALESCE(retry,0)+1 ELSE retry
+                    END),
+                exc_name=(
+                    CASE
+                        WHEN
+                            max_retries IS NOT NULL AND
+                            retry IS NOT NULL AND
+                            retry>max_retries
+                        THEN 'JobFoundDead'
+                        ELSE exc_name
+                    END),
+                exc_info=(
+                    CASE
+                        WHEN
+                            max_retries IS NOT NULL AND
+                            retry IS NOT NULL AND
+                            retry>max_retries
+                        THEN 'Job found dead after too many retries'
+                        ELSE exc_info
+                    END)
+            WHERE
+                id in (
+                    SELECT
+                        queue_job_id
+                    FROM
+                        queue_job_lock
+                    WHERE
+                        queue_job_id in (
+                            SELECT
+                                id
+                            FROM
+                                queue_job
+                            WHERE
+                                state IN ('enqueued','started')
+                                AND date_enqueued <
+                                (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
+                        )
+                    FOR UPDATE SKIP LOCKED
+                )
+            RETURNING uuid
+            """
+
+    def requeue_dead_jobs(self):
+        """
+        Reset 'enqueued' and 'started' jobs whose lock row is not held.
+
+        A job holds its queue_job_lock row while it is being executed;
+        when the job is killed, that lock is released.
+
+        If the number of retries exceeds the number of max retries,
+        the job is set to 'failed' with the error 'JobFoundDead';
+        otherwise it is set back to 'pending'. Each reset job is logged.
+
+        Only jobs enqueued more than 10 seconds ago are considered, so that
+        jobs are not requeued before they have actually started.
+
+        When Odoo shuts down normally, it waits for running jobs to finish.
+        However, when the Odoo server crashes or is otherwise force-stopped,
+        running jobs are interrupted while the runner has no chance to know
+        they have been aborted.
+        """
+
+        with closing(self.conn.cursor()) as cr:
+            query = self._query_requeue_dead_jobs()
+
+            cr.execute(query)
+
+            for (uuid,) in cr.fetchall():
+                _logger.warning("Re-queued dead job with uuid: %s", uuid)
+
 
 class QueueJobRunner:
     def __init__(
@@ -422,6 +472,11 @@ def initialize_databases(self):
                         self.channel_manager.notify(db_name, *job_data)
                 _logger.info("queue job runner ready for db %s", db_name)
 
+    def requeue_dead_jobs(self):  # requeue dead jobs in each db with has_queue_job
+        for db in self.db_by_name.values():
+            if db.has_queue_job:
+                db.requeue_dead_jobs()
+
     def run_jobs(self):
         now = _odoo_now()
         for job in self.channel_manager.get_jobs_to_run(now):
@@ -514,6 +569,7 @@ def run(self):
                 _logger.info("database connections ready")
                 # inner loop does the normal processing
                 while not self._stop:
+                    self.requeue_dead_jobs()
                     self.process_notifications()
                     self.run_jobs()
                     self.wait_notification()

diff --git a/queue_job/migrations/18.0.1.3.0/pre-migration.py b/queue_job/migrations/18.0.1.3.0/pre-migration.py
@@ -0,0 +1,22 @@
+# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl.html)
+
+
+def migrate(cr, version):
+    """Deactivate the 'ir_cron_queue_job_garbage_collector' cron of queue_job."""
+    cr.execute(
+        """
+            UPDATE
+                ir_cron
+            SET
+                active=False
+            WHERE id IN (
+                SELECT res_id
+                FROM
+                    ir_model_data
+                WHERE
+                    module='queue_job'
+                    AND model='ir.cron'
+                    AND name='ir_cron_queue_job_garbage_collector'
+            );
+        """
+    )
diff --git a/queue_job/models/__init__.py b/queue_job/models/__init__.py
@@ -3,3 +3,4 @@
 from . import queue_job
 from . import queue_job_channel
 from . import queue_job_function
+from . import queue_job_lock
diff --git a/queue_job/models/queue_job.py b/queue_job/models/queue_job.py
@@ -6,7 +6,6 @@
 from datetime import datetime, timedelta
 
 from odoo import _, api, exceptions, fields, models
-from odoo.osv import expression
 from odoo.tools import config, html_escape
 
 from odoo.addons.base_sparse_field.models.fields import Serialized
@@ -409,55 +408,6 @@ def autovacuum(self):
                     break
         return True
 
-    def requeue_stuck_jobs(self, enqueued_delta=5, started_delta=0):
-        """Fix jobs that are in a bad states
-
-        :param in_queue_delta: lookup time in minutes for jobs
-                                that are in enqueued state
-
-        :param started_delta: lookup time in minutes for jobs
-                                that are in enqueued state,
-                                0 means that it is not checked
-        """
-        self._get_stuck_jobs_to_requeue(
-            enqueued_delta=enqueued_delta, started_delta=started_delta
-        ).requeue()
-        return True
-
-    def _get_stuck_jobs_domain(self, queue_dl, started_dl):
-        domain = []
-        now = fields.datetime.now()
-        if queue_dl:
-            queue_dl = now - timedelta(minutes=queue_dl)
-            domain.append(
-                [
-                    "&",
-                    ("date_enqueued", "<=", fields.Datetime.to_string(queue_dl)),
-                    ("state", "=", "enqueued"),
-                ]
-            )
-        if started_dl:
-            started_dl = now - timedelta(minutes=started_dl)
-            domain.append(
-                [
-                    "&",
-                    ("date_started", "<=", fields.Datetime.to_string(started_dl)),
-                    ("state", "=", "started"),
-                ]
-            )
-        if not domain:
-            raise exceptions.ValidationError(
-                _("If both parameters are 0, ALL jobs will be requeued!")
-            )
-        return expression.OR(domain)
-
-    def _get_stuck_jobs_to_requeue(self, enqueued_delta, started_delta):
-        job_model = self.env["queue.job"]
-        stuck_jobs = job_model.search(
-            self._get_stuck_jobs_domain(enqueued_delta, started_delta)
-        )
-        return stuck_jobs
-
     def related_action_open_record(self):
         """Open a form view with the record(s) of the job.