From 019f74e12fbd30ebde73d3437f1bd212f8a19029 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 26 Feb 2025 13:49:27 -0800 Subject: [PATCH] recipe(lock): retry lock cleanup If a non-blocking lock fails to acquire the lock, and then encounters a KazooError (due to a suspended session), the _best_effort_cleanup method will swallow the exception and control will return without the lock contender node being deleted. If the session resumes (does not expire) then we will have left a lock contender in place, which will eventually become an orphaned, stuck lock once the original actor releases it. To correct this, retry deleting the lock contender in all cases. Due to the importance of this, we ignore the supplied timeout (in case the aquire method was called with a timeout) and retry forever. Closes: #732 --- kazoo/recipe/lock.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kazoo/recipe/lock.py b/kazoo/recipe/lock.py index 8a490394..c05a2141 100644 --- a/kazoo/recipe/lock.py +++ b/kazoo/recipe/lock.py @@ -193,13 +193,13 @@ def acquire(self, blocking=True, timeout=None, ephemeral=True): except KazooException: # if we did ultimately fail, attempt to clean up if not already_acquired: - self._best_effort_cleanup() + self._cleanup() self.cancelled = False raise if gotten: self.is_acquired = gotten if not gotten and not already_acquired: - self._best_effort_cleanup() + self._cleanup() return gotten finally: self._acquire_method_lock.release() @@ -318,13 +318,18 @@ def _find_node(self): def _delete_node(self, node): self.client.delete(self.path + "/" + node) - def _best_effort_cleanup(self): - try: - node = self.node or self._find_node() - if node: + def _cleanup(self): + self._retry( + self._inner_cleanup, + ) + + def _inner_cleanup(self): + node = self.node or self._find_node() + if node: + try: self._delete_node(node) - except KazooException: # pragma: nocover - pass + except NoNodeError: # pragma: nocover + pass def release(self): """Release the lock immediately."""