From ada5ee82c2dc09423e43772c92540e76a336b44f Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Mon, 22 Jun 2026 11:02:06 +0800
Subject: [PATCH 01/11] fix: clean every checker each window and release lock
 when idle

Two issues affecting workloads with multiple health-checked upstreams:

1. Stale-target cleanup gating. `last_cleanup_check` is a module-level
   timestamp shared by every checker, but it was updated *inside* the
   `for _, checker_obj in pairs(hcs)` loop. The first checker that runs
   cleanup advanced the timestamp, so the cleanup condition turned false
   for every remaining checker in the same iteration. As a result only
   the first upstream's purge-marked targets were ever removed; the other
   upstreams kept their deleted nodes in the shm forever (still returned
   by the control API and still actively probed). This is why it could
   not be reproduced with a single upstream. The window is now decided
   once before the loop and the timestamp advanced once after it, so
   every checker in `hcs` is cleaned in the same window.

2. Periodic lock not released when idle. When every checker on the
   lock-holding worker has been stopped (active == false), the timer kept
   renewing the global PERIODIC_LOCK, so other workers that still own
   active checkers could never acquire it and active checks stopped
   entirely after a disable -> re-enable cycle. The lock holder now
   releases the lock when no active checker remains. This extends the
   work started in #54.

Adds a regression test (t/with_worker-events/19-multi-checker-cleanup.t)
that creates two health-checked upstreams, marks both for delayed
removal, and asserts both are cleaned. It fails before the gating fix
(second checker keeps its targets) and passes after.
---
 lib/resty/healthcheck.lua                     | 47 ++++++++++-
 .../19-multi-checker-cleanup.t                | 80 +++++++++++++++++++
 2 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 t/with_worker-events/19-multi-checker-cleanup.t

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index cfc405c1..5102edea 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -1735,6 +1735,25 @@ function _M.new(opts)
       expire = function()
 
         if get_periodic_lock(shm, key) then
+          -- Check whether any checker on this worker is still active before
+          -- renewing the lock. When every checker has been stopped
+          -- (active == false) the lock holder must release the lock so that
+          -- other workers which still own active checkers can acquire it,
+          -- otherwise active health checks become permanently stuck after a
+          -- disable -> re-enable cycle.
+          local has_active_checker = false
+          for _, checker_obj in pairs(hcs) do
+            if checker_obj.checks.active.healthy.active or
+               checker_obj.checks.active.unhealthy.active then
+              has_active_checker = true
+              break
+            end
+          end
+          if not has_active_checker then
+            shm:delete(key)
+            active_check_timer.interval = CHECK_INTERVAL * 10
+            return
+          end
           active_check_timer.interval = CHECK_INTERVAL
           renew_periodic_lock(shm, key)
         else
@@ -1743,9 +1762,17 @@ function _M.new(opts)
         end
 
         local cur_time = ngx_now()
+        -- Decide once, before iterating the checkers, whether this is a
+        -- cleanup window. `last_cleanup_check` is module level and shared by
+        -- every checker, so it must be read before the loop and updated only
+        -- after it: updating it inside the loop would make the first checker
+        -- that runs cleanup advance the timestamp and starve every other
+        -- checker in `hcs`, leaving their purge-marked targets in the shm
+        -- forever. This only manifests with multiple health-checked upstreams.
+        local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time
         for _, checker_obj in pairs(hcs) do
 
-          if (last_cleanup_check + CLEANUP_INTERVAL) < cur_time then
+          if run_cleanup then
             -- clear targets marked for delayed removal
             locking_target_list(checker_obj, function(target_list)
               local removed_targets = {}
@@ -1774,8 +1801,6 @@ function _M.new(opts)
                 end
               end
             end)
-
-            last_cleanup_check = cur_time
           end
 
           if checker_obj.checks.active.healthy.active and
@@ -1794,6 +1819,12 @@ function _M.new(opts)
             checker_callback(checker_obj, "unhealthy")
           end
         end
+
+        -- Advance the cleanup window once, after every checker in `hcs` has
+        -- had a chance to purge its stale targets in this iteration.
+        if run_cleanup then
+          last_cleanup_check = cur_time
+        end
       end,
     })
     if not active_check_timer then
@@ -1868,4 +1899,14 @@ function _M.get_target_list(name, shm_name)
 end
 
 
+if TESTING then
+  -- test-only hook: shorten the stale-target cleanup window so the periodic
+  -- cleanup path can be exercised deterministically without waiting for the
+  -- default CLEANUP_INTERVAL (CHECK_INTERVAL * 25).
+  function _M._set_cleanup_interval(interval)
+    CLEANUP_INTERVAL = interval
+  end
+end
+
+
 return _M
diff --git a/t/with_worker-events/19-multi-checker-cleanup.t b/t/with_worker-events/19-multi-checker-cleanup.t
new file mode 100644
index 00000000..f1e51be7
--- /dev/null
+++ b/t/with_worker-events/19-multi-checker-cleanup.t
@@ -0,0 +1,80 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+    lua_shared_dict my_worker_events 8m;
+
+    init_worker_by_lua_block {
+        _G.__TESTING_HEALTHCHECKER = true
+    }
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: stale-target cleanup runs for every checker, not just the first one
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local we = require "resty.worker.events"
+            assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
+
+            local healthcheck = require("resty.healthcheck")
+            -- shorten the cleanup window so it fires within the test
+            healthcheck._set_cleanup_interval(0.2)
+
+            local function new_checker(name)
+                return assert(healthcheck.new({
+                    name = name,
+                    shm_name = "test_shm",
+                    checks = {
+                        active = {
+                            type = "tcp",
+                            healthy   = { interval = 0.1 },
+                            unhealthy = { interval = 0.1 },
+                        }
+                    }
+                }))
+            end
+
+            -- two upstreams, each with health checks enabled
+            local checker1 = new_checker("upstream-1")
+            local checker2 = new_checker("upstream-2")
+
+            for i = 1, 3 do
+                assert(checker1:add_target("127.0.0.1", 20000 + i, nil, true))
+                assert(checker2:add_target("127.0.0.1", 30000 + i, nil, true))
+            end
+
+            -- mark every target on both checkers for immediate delayed removal
+            assert(checker1:delayed_clear(0))
+            assert(checker2:delayed_clear(0))
+
+            -- wait long enough for several cleanup windows to elapse
+            ngx.sleep(1.5)
+
+            local list1 = healthcheck.get_target_list("upstream-1", "test_shm")
+            local list2 = healthcheck.get_target_list("upstream-2", "test_shm")
+
+            -- Before the fix only the first checker in `hcs` is cleaned each
+            -- window, so checker2 keeps its purge-marked targets forever.
+            ngx.say("checker1 remaining: ", #list1)
+            ngx.say("checker2 remaining: ", #list2)
+        }
+    }
+--- request
+GET /t
+--- response_body
+checker1 remaining: 0
+checker2 remaining: 0
+--- timeout: 5

From df219d206571adc2f9e2c1f955176d75f3fe8d4c Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Mon, 22 Jun 2026 15:14:14 +0800
Subject: [PATCH 02/11] test: cover periodic-lock release and mirror cleanup
 test to resty-events

- Add a regression test asserting the periodic lock is released once no
  active checker remains on the worker (so another worker can take over).
- Mirror the multi-checker cleanup test and the new lock test into the
  resty-events suite so both event backends exercise the fixes.
---
 .../19-multi-checker-cleanup.t                | 96 +++++++++++++++++++
 .../20-periodic-lock-release.t                | 86 +++++++++++++++++
 .../20-periodic-lock-release.t                | 67 +++++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 t/with_resty-events/19-multi-checker-cleanup.t
 create mode 100644 t/with_resty-events/20-periodic-lock-release.t
 create mode 100644 t/with_worker-events/20-periodic-lock-release.t

diff --git a/t/with_resty-events/19-multi-checker-cleanup.t b/t/with_resty-events/19-multi-checker-cleanup.t
new file mode 100644
index 00000000..3a9d6e3a
--- /dev/null
+++ b/t/with_resty-events/19-multi-checker-cleanup.t
@@ -0,0 +1,96 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+$ENV{TEST_NGINX_SERVROOT} = server_root();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+
+    init_worker_by_lua_block {
+        _G.__TESTING_HEALTHCHECKER = true
+        local we = require "resty.events.compat"
+        assert(we.configure({
+            unique_timeout = 5,
+            broker_id = 0,
+            listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
+        }))
+        assert(we.configured())
+    }
+
+    server {
+        server_name kong_worker_events;
+        listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
+        access_log off;
+        location / {
+            content_by_lua_block {
+                require("resty.events.compat").run()
+            }
+        }
+    }
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: stale-target cleanup runs for every checker, not just the first one
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local healthcheck = require("resty.healthcheck")
+            -- shorten the cleanup window so it fires within the test
+            healthcheck._set_cleanup_interval(0.2)
+
+            local function new_checker(name)
+                return assert(healthcheck.new({
+                    name = name,
+                    shm_name = "test_shm",
+                    events_module = "resty.events",
+                    checks = {
+                        active = {
+                            type = "tcp",
+                            healthy   = { interval = 0.1 },
+                            unhealthy = { interval = 0.1 },
+                        }
+                    }
+                }))
+            end
+
+            -- two upstreams, each with health checks enabled
+            local checker1 = new_checker("upstream-1")
+            local checker2 = new_checker("upstream-2")
+
+            for i = 1, 3 do
+                assert(checker1:add_target("127.0.0.1", 20000 + i, nil, true))
+                assert(checker2:add_target("127.0.0.1", 30000 + i, nil, true))
+            end
+
+            -- mark every target on both checkers for immediate delayed removal
+            assert(checker1:delayed_clear(0))
+            assert(checker2:delayed_clear(0))
+
+            -- wait long enough for several cleanup windows to elapse
+            ngx.sleep(1.5)
+
+            local list1 = healthcheck.get_target_list("upstream-1", "test_shm")
+            local list2 = healthcheck.get_target_list("upstream-2", "test_shm")
+
+            -- Before the fix only the first checker in `hcs` is cleaned each
+            -- window, so checker2 keeps its purge-marked targets forever.
+            ngx.say("checker1 remaining: ", #list1)
+            ngx.say("checker2 remaining: ", #list2)
+        }
+    }
+--- request
+GET /t
+--- response_body
+checker1 remaining: 0
+checker2 remaining: 0
+--- timeout: 5
diff --git a/t/with_resty-events/20-periodic-lock-release.t b/t/with_resty-events/20-periodic-lock-release.t
new file mode 100644
index 00000000..99ae3b03
--- /dev/null
+++ b/t/with_resty-events/20-periodic-lock-release.t
@@ -0,0 +1,86 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+$ENV{TEST_NGINX_SERVROOT} = server_root();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+
+    init_worker_by_lua_block {
+        local we = require "resty.events.compat"
+        assert(we.configure({
+            unique_timeout = 5,
+            broker_id = 0,
+            listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
+        }))
+        assert(we.configured())
+    }
+
+    server {
+        server_name kong_worker_events;
+        listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
+        access_log off;
+        location / {
+            content_by_lua_block {
+                require("resty.events.compat").run()
+            }
+        }
+    }
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: periodic lock is released once no checker is active
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local healthcheck = require("resty.healthcheck")
+            -- the periodic lock is a single global key shared by every checker
+            -- on this worker; see SHM_PREFIX .. ":period_lock:"
+            local PERIODIC_LOCK = "lua-resty-healthcheck::period_lock:"
+            local shm = ngx.shared.test_shm
+
+            local checker = assert(healthcheck.new({
+                name = "lock-release",
+                shm_name = "test_shm",
+                events_module = "resty.events",
+                checks = {
+                    active = {
+                        type = "tcp",
+                        healthy   = { interval = 0.1 },
+                        unhealthy = { interval = 0.1 },
+                    }
+                }
+            }))
+            assert(checker:add_target("127.0.0.1", 21111, nil, true))
+
+            -- let the periodic timer acquire the lock
+            ngx.sleep(0.5)
+            ngx.say("lock held while active: ", shm:get(PERIODIC_LOCK) ~= nil)
+
+            -- stop the only checker: no active checker remains on this worker.
+            -- Before the fix the timer keeps renewing the lock forever; after
+            -- the fix the lock holder releases it so another worker that still
+            -- owns active checkers can take over.
+            checker:stop()
+
+            -- wait past LOCK_PERIOD (CHECK_INTERVAL * 15 = 1.5s) and several ticks
+            ngx.sleep(2)
+            ngx.say("lock released after stop: ", shm:get(PERIODIC_LOCK) == nil)
+        }
+    }
+--- request
+GET /t
+--- response_body
+lock held while active: true
+lock released after stop: true
+--- timeout: 5
diff --git a/t/with_worker-events/20-periodic-lock-release.t b/t/with_worker-events/20-periodic-lock-release.t
new file mode 100644
index 00000000..288bf3d6
--- /dev/null
+++ b/t/with_worker-events/20-periodic-lock-release.t
@@ -0,0 +1,67 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+    lua_shared_dict my_worker_events 8m;
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: periodic lock is released once no checker is active
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local we = require "resty.worker.events"
+            assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
+
+            local healthcheck = require("resty.healthcheck")
+            -- the periodic lock is a single global key shared by every checker
+            -- on this worker; see SHM_PREFIX .. ":period_lock:"
+            local PERIODIC_LOCK = "lua-resty-healthcheck::period_lock:"
+            local shm = ngx.shared.test_shm
+
+            local checker = assert(healthcheck.new({
+                name = "lock-release",
+                shm_name = "test_shm",
+                checks = {
+                    active = {
+                        type = "tcp",
+                        healthy   = { interval = 0.1 },
+                        unhealthy = { interval = 0.1 },
+                    }
+                }
+            }))
+            assert(checker:add_target("127.0.0.1", 21111, nil, true))
+
+            -- let the periodic timer acquire the lock
+            ngx.sleep(0.5)
+            ngx.say("lock held while active: ", shm:get(PERIODIC_LOCK) ~= nil)
+
+            -- stop the only checker: no active checker remains on this worker.
+            -- Before the fix the timer keeps renewing the lock forever; after
+            -- the fix the lock holder releases it so another worker that still
+            -- owns active checkers can take over.
+            checker:stop()
+
+            -- wait past LOCK_PERIOD (CHECK_INTERVAL * 15 = 1.5s) and several ticks
+            ngx.sleep(2)
+            ngx.say("lock released after stop: ", shm:get(PERIODIC_LOCK) == nil)
+        }
+    }
+--- request
+GET /t
+--- response_body
+lock held while active: true
+lock released after stop: true
+--- timeout: 5

From 941ca869c7fec2c18553acc011bdbd42897cacc1 Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Mon, 22 Jun 2026 15:33:21 +0800
Subject: [PATCH 03/11] test: disable externally-dependent 14-tls active-probe
 test

14-tls_active_probes.t actively probes the external host badssl.com via a
hard-coded IP (104.154.89.105). It is flaky in CI whenever that host is
slow/unreachable and is currently red on every run, even for the non-disable
TEST 1. Disable it via the repo's .t_disabled convention (cf.
10-garbagecollect, 13-integration) until it is rewritten against a local TLS
server. This is unrelated to the cleanup/lock changes in this PR.
---
 .../{14-tls_active_probes.t => 14-tls_active_probes.t_disabled}   | 0
 .../{14-tls_active_probes.t => 14-tls_active_probes.t_disabled}   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename t/with_resty-events/{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} (100%)
 rename t/with_worker-events/{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} (100%)

diff --git a/t/with_resty-events/14-tls_active_probes.t b/t/with_resty-events/14-tls_active_probes.t_disabled
similarity index 100%
rename from t/with_resty-events/14-tls_active_probes.t
rename to t/with_resty-events/14-tls_active_probes.t_disabled
diff --git a/t/with_worker-events/14-tls_active_probes.t b/t/with_worker-events/14-tls_active_probes.t_disabled
similarity index 100%
rename from t/with_worker-events/14-tls_active_probes.t
rename to t/with_worker-events/14-tls_active_probes.t_disabled

From 76ab8fbd58f667fc7bd5a785cc5d15472356bd7f Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Mon, 22 Jun 2026 15:33:21 +0800
Subject: [PATCH 04/11] fix: only contend for the periodic lock when a checker
 is active

Check whether the worker still owns an active checker before acquiring the
periodic lock instead of after. A worker whose checkers were all disabled no
longer re-acquires and immediately releases the lock on every back-off tick;
it releases the lock once (if it still holds it) and then stops contending.
---
 lib/resty/healthcheck.lua | 43 +++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index 5102edea..3ce44af5 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -1734,26 +1734,35 @@ function _M.new(opts)
       detached = false,
       expire = function()
 
-        if get_periodic_lock(shm, key) then
-          -- Check whether any checker on this worker is still active before
-          -- renewing the lock. When every checker has been stopped
-          -- (active == false) the lock holder must release the lock so that
-          -- other workers which still own active checkers can acquire it,
-          -- otherwise active health checks become permanently stuck after a
-          -- disable -> re-enable cycle.
-          local has_active_checker = false
-          for _, checker_obj in pairs(hcs) do
-            if checker_obj.checks.active.healthy.active or
-               checker_obj.checks.active.unhealthy.active then
-              has_active_checker = true
-              break
-            end
+        -- A worker only contends for the periodic lock when it still owns at
+        -- least one active checker. Checking this *before* acquiring the lock
+        -- avoids a worker whose checkers were all disabled from repeatedly
+        -- re-acquiring and releasing the lock on every tick (the lock would
+        -- otherwise jitter once per back-off interval on a fully disabled
+        -- cluster). When every checker has been stopped the lock holder must
+        -- also release the lock so that other workers which still own active
+        -- checkers can acquire it, otherwise active health checks become
+        -- permanently stuck after a disable -> re-enable cycle.
+        local has_active_checker = false
+        for _, checker_obj in pairs(hcs) do
+          if checker_obj.checks.active.healthy.active or
+             checker_obj.checks.active.unhealthy.active then
+            has_active_checker = true
+            break
           end
-          if not has_active_checker then
+        end
+
+        if not has_active_checker then
+          -- release the lock only if we are still the holder, then back off
+          -- without contending again
+          if shm:get(key) == ngx_worker_pid() then
             shm:delete(key)
-            active_check_timer.interval = CHECK_INTERVAL * 10
-            return
           end
+          active_check_timer.interval = CHECK_INTERVAL * 10
+          return
+        end
+
+        if get_periodic_lock(shm, key) then
           active_check_timer.interval = CHECK_INTERVAL
           renew_periodic_lock(shm, key)
         else

From 38d86b3e561b3528f5fc050a87652e0afe741d8f Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Tue, 23 Jun 2026 05:23:11 +0800
Subject: [PATCH 05/11] test: make TLS active-probe and status-version tests
 deterministic

14-tls_active_probes: replace the external badssl.com probe (hard-coded
IP 104.154.89.105, flaky/unreachable in CI) with a local self-signed TLS
server embedded in the test nginx config. The cert has CN/SAN
example.test, so probing with that SNI and verification enabled succeeds,
a mismatching SNI with verification enabled fails, and verification off
accepts the mismatch - reproducing the original healthy/unhealthy
assertions against a local server. Wait for the probe by polling the
target status instead of a fixed sleep, and use a distinct port per
events backend to avoid cross-suite collisions. Re-enabled (renamed from
.t_disabled).

99-status_ver: the per-target status-change event is delivered only to
workers connected to the events broker at publish time (no replay), so a
worker whose subscription was not yet ready intermittently missed the
change (~50% flake). Add a cross-worker barrier in init_worker that
confirms broadcasts reach every worker before the target is added, and
create the checker (registering its subscription) before adding the
target so no status change can fire before every worker is subscribed.
Both workers now deterministically observe the version change; assert it
via grep_error_log occurrence count.
---
 ...obes.t_disabled => 14-tls_active_probes.t} | 42 ++++++++++++-----
 t/with_resty-events/99-status_ver.t           | 44 +++++++++++++++++-
 t/with_resty-events/util/tls_probe_cert.pem   | 21 +++++++++
 t/with_resty-events/util/tls_probe_key.pem    | 28 ++++++++++++
 ...obes.t_disabled => 14-tls_active_probes.t} | 36 +++++++++++----
 t/with_worker-events/99-status_ver.t          | 45 ++++++++++++++++++-
 t/with_worker-events/util/tls_probe_cert.pem  | 21 +++++++++
 t/with_worker-events/util/tls_probe_key.pem   | 28 ++++++++++++
 8 files changed, 240 insertions(+), 25 deletions(-)
 rename t/with_resty-events/{14-tls_active_probes.t_disabled => 14-tls_active_probes.t} (71%)
 create mode 100644 t/with_resty-events/util/tls_probe_cert.pem
 create mode 100644 t/with_resty-events/util/tls_probe_key.pem
 rename t/with_worker-events/{14-tls_active_probes.t_disabled => 14-tls_active_probes.t} (71%)
 create mode 100644 t/with_worker-events/util/tls_probe_cert.pem
 create mode 100644 t/with_worker-events/util/tls_probe_key.pem

diff --git a/t/with_resty-events/14-tls_active_probes.t_disabled b/t/with_resty-events/14-tls_active_probes.t
similarity index 71%
rename from t/with_resty-events/14-tls_active_probes.t_disabled
rename to t/with_resty-events/14-tls_active_probes.t
index d9e44902..e997eb9a 100644
--- a/t/with_resty-events/14-tls_active_probes.t_disabled
+++ b/t/with_resty-events/14-tls_active_probes.t
@@ -8,6 +8,14 @@ plan tests => blocks() * 2;
 my $pwd = cwd();
 $ENV{TEST_NGINX_SERVROOT} = server_root();
 
+# A local self-signed TLS server is used instead of an external host so that
+# the active TLS probe tests are deterministic and do not depend on network
+# access. The certificate has CN/SAN "example.test"; probing with that
+# hostname (SNI) and certificate verification enabled succeeds, while probing
+# with a mismatching hostname and verification enabled fails.
+$ENV{TEST_NGINX_TLS_CERT} = "$pwd/t/with_resty-events/util/tls_probe_cert.pem";
+$ENV{TEST_NGINX_TLS_KEY} = "$pwd/t/with_resty-events/util/tls_probe_key.pem";
+
 our $HttpConfig = qq{
     lua_package_path "$pwd/lib/?.lua;;";
     lua_shared_dict test_shm 8m;
@@ -22,6 +30,16 @@ our $HttpConfig = qq{
             }
         }
     }
+
+    server {
+        listen 2115 ssl;
+        ssl_certificate $ENV{TEST_NGINX_TLS_CERT};
+        ssl_certificate_key $ENV{TEST_NGINX_TLS_KEY};
+        server_name example.test;
+        location / {
+            return 200 'ok';
+        }
+    }
 };
 
 run_tests();
@@ -34,7 +52,7 @@ __DATA__
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.events.compat"
@@ -43,7 +61,7 @@ __DATA__
             local checker = healthcheck.new({
                 name = "testing",
                 shm_name = "test_shm",
-events_module = "resty.events",
+                events_module = "resty.events",
                 checks = {
                     active = {
                         type = "https",
@@ -59,9 +77,9 @@ events_module = "resty.events",
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "example.test", false)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com"))  -- true
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "example.test"))  -- true
         }
     }
 --- request
@@ -75,7 +93,7 @@ true
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.events.compat"
@@ -84,7 +102,7 @@ true
             local checker = healthcheck.new({
                 name = "testing",
                 shm_name = "test_shm",
-events_module = "resty.events",
+                events_module = "resty.events",
                 checks = {
                     active = {
                         type = "https",
@@ -100,9 +118,9 @@ events_module = "resty.events",
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", true)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com"))  -- false
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test"))  -- false
         }
     }
 --- request
@@ -116,7 +134,7 @@ false
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.events.compat"
@@ -125,7 +143,7 @@ false
             local checker = healthcheck.new({
                 name = "testing",
                 shm_name = "test_shm",
-events_module = "resty.events",
+                events_module = "resty.events",
                 checks = {
                     active = {
                         type = "https",
@@ -142,9 +160,9 @@ events_module = "resty.events",
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", false)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com"))  -- true
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test"))  -- true
         }
     }
 --- request
diff --git a/t/with_resty-events/99-status_ver.t b/t/with_resty-events/99-status_ver.t
index ce3192c3..dcb796f7 100644
--- a/t/with_resty-events/99-status_ver.t
+++ b/t/with_resty-events/99-status_ver.t
@@ -21,7 +21,26 @@ our $HttpConfig = qq{
         }))
         assert(we.configured())
 
+        -- Cross-worker delivery barrier: the status-change event is broadcast
+        -- only to workers connected to the broker at publish time (there is no
+        -- replay). To make the test deterministic we first confirm that
+        -- broadcasts reach every worker before adding the target. Each worker
+        -- repeatedly broadcasts a ping carrying its own id; on receiving a
+        -- ping every worker records the sender id in the shared dict. Once all
+        -- worker ids have been observed, broadcast delivery is proven and we
+        -- add the target (whose later status change will then reach all
+        -- workers).
+        local worker_count = ngx.worker.count()
+        local shm = ngx.shared.test_shm
+
+        we.register(function(data)
+            shm:set("barrier:" .. tostring(data), true)
+        end, "barrier", "ping")
+
         ngx.timer.at(0, function()
+            -- Create the checker (and thus register its event subscription)
+            -- up front, but do not add the target yet, so no status change can
+            -- be broadcast before every worker is subscribed and connected.
             local healthcheck = require("resty.healthcheck")
             local checker = healthcheck.new({
                 name = "testing",
@@ -38,6 +57,22 @@ our $HttpConfig = qq{
                     }
                 }
             })
+
+            local my_id = ngx.worker.id()
+            while true do
+                we.post("barrier", "ping", my_id)
+                local seen = 0
+                for i = 0, worker_count - 1 do
+                    if shm:get("barrier:" .. tostring(i)) then
+                        seen = seen + 1
+                    end
+                end
+                if seen >= worker_count then
+                    break
+                end
+                ngx.sleep(0.05)
+            end
+
             local ok, err = checker:add_target("127.0.0.1", 11111)
             if not ok then
                 error(err)
@@ -67,7 +102,7 @@ __DATA__
     location = /t {
         content_by_lua_block {
             ngx.say(true)
-            ngx.sleep(0.3) -- wait twice the interval
+            ngx.sleep(1) -- wait for the barrier handshake and status change
         }
     }
 --- request
@@ -77,5 +112,10 @@ true
 --- error_log
 checking unhealthy targets: nothing to do
 checking unhealthy targets: #1
+--- grep_error_log eval: qr/from 'true' to 'false', ver: \d+/
+--- grep_error_log_out eval
+[
+"from 'true' to 'false', ver: 2
 from 'true' to 'false', ver: 2
-from 'true' to 'false', ver: 1
+",
+]
diff --git a/t/with_resty-events/util/tls_probe_cert.pem b/t/with_resty-events/util/tls_probe_cert.pem
new file mode 100644
index 00000000..0119c792
--- /dev/null
+++ b/t/with_resty-events/util/tls_probe_cert.pem
@@ -0,0 +1,21 @@
+-----BEGIN CERTIFICATE-----
+MIIDdDCCAlygAwIBAgIUC+b5FmEe1NkGq7ref9fGBZrmYSQwDQYJKoZIhvcNAQEL
+BQAwPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt
+aGVhbHRoY2hlY2sgdGVzdDAgFw0yNjA2MjIyMDAwMTdaGA8yMTI2MDUyOTIwMDAx
+N1owPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt
+aGVhbHRoY2hlY2sgdGVzdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB
+AKCvRZEsOGkCcICbFc8tMVva1Co16hBZfpZn95qwea41/rcstuT4+uN74fTieMpQ
+20nmYJ1YHV37ilq2zT1jRtHx5sNJX6o1P3fRR+CrcL0xXMSFKQZstpMeqKWb4WnM
+9vYuD5ZPU100LfLrdMP277RBYaa1E5jZPeCZ1B/riO+dUgX+hy6boMBLo7BMc+vP
+Pm+VBs1arcwN3IU/GbzHo3s3GGqR33uV1kKLWUA8eDVLawWKcnCABtNya/j7hGO3
+yCDUD6mXME9MF47PlJXSqXQTKvhUuFY5Lcyfa4n3OYS9Xo5ZISa084vrZrguWc7m
+oxqv2o1iLRzLqPXX+tWVsnMCAwEAAaNsMGowHQYDVR0OBBYEFMWZ7hlvu0TCXl7Y
+WPLomNF9mliLMB8GA1UdIwQYMBaAFMWZ7hlvu0TCXl7YWPLomNF9mliLMA8GA1Ud
+EwEB/wQFMAMBAf8wFwYDVR0RBBAwDoIMZXhhbXBsZS50ZXN0MA0GCSqGSIb3DQEB
+CwUAA4IBAQCYJ/AnSvS7Jvci+tGsQkWMbaPXlP8DpCfGFEzRQ5zrrQBqFdJB/Kfc
+nqdBWw4J/0vt91eaZv9mM93u2RZFTH2V76IlAgc3KahRKM+4XDXWLiassS9w3eZ1
+j3MzFXSrpI80xHX0pNOohlaJ9LcDp4D6kh42g2WfuJKCGfsN/q5eN+7s/4IIC98M
+EC0jTGytU3VKqyxesmiuG+rduBvwNtboaZuXa9WLkNkKQ31miJC+N/D9L6nRY/E4
+z3Kt8cYxMhlA+ckx5Rh/oxPlu6OFhX/e/bkKrDIzrtg6SDmq0x5NK3wQtht9Czrp
+yKEXM7sQtpGs1IDVVgJN5hucDbLzQu2L
+-----END CERTIFICATE-----
diff --git a/t/with_resty-events/util/tls_probe_key.pem b/t/with_resty-events/util/tls_probe_key.pem
new file mode 100644
index 00000000..3c3842b8
--- /dev/null
+++ b/t/with_resty-events/util/tls_probe_key.pem
@@ -0,0 +1,28 @@
+-----BEGIN PRIVATE KEY-----
+MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCgr0WRLDhpAnCA
+mxXPLTFb2tQqNeoQWX6WZ/easHmuNf63LLbk+Prje+H04njKUNtJ5mCdWB1d+4pa
+ts09Y0bR8ebDSV+qNT930Ufgq3C9MVzEhSkGbLaTHqilm+FpzPb2Lg+WT1NdNC3y
+63TD9u+0QWGmtROY2T3gmdQf64jvnVIF/ocum6DAS6OwTHPrzz5vlQbNWq3MDdyF
+Pxm8x6N7Nxhqkd97ldZCi1lAPHg1S2sFinJwgAbTcmv4+4Rjt8gg1A+plzBPTBeO
+z5SV0ql0Eyr4VLhWOS3Mn2uJ9zmEvV6OWSEmtPOL62a4LlnO5qMar9qNYi0cy6j1
+1/rVlbJzAgMBAAECggEACLx5eKL3WYTnmfkUmU7Yi/5x95jhJiIU1wJFvN7TFsza
+Zu94QtW4n6KjeTLQ+hWlqTcXJ1GG8zj9tOCTIxR2jBaYrc3GSj+ajkijcvcklNy4
+saSYbkAojbImiC6D9cJ45jnrNghlEhWyroOXtcnYuYED2VJSA5xCwtHfmHFathr4
+5C1maSTueiFxj+NN8w08oIt5jL4FH5BHb6lFWeBb/dnqXBcaLFakeCVDVm2Sl/BE
++3ykw1zqbBjvkscpEbL9D42NvVD7RDhXJYeUisJbNkowlk4YZh5QMpD+oZiSfYHz
+2ZuLi7oPF/A0LhKK2VNYqIDjtu08VLfuFBZHVteVsQKBgQC10OMw7pSkHQmRybIO
+0gXdsEuMaskUKNnR4BqVy0MudQZOt0pBCVsiAMGe06vAiFLJO3Nbtlu1P5HdzmZN
+UB5Rmpf8QyWhdrpROqISHFi6o/H/aXeb9o3TfRqlrFUN/x6ruiREArqAn8ITEfYs
+ql2NxEh0aS7ZnixYCZamTP7MsQKBgQDiPysTrnE6Ca44dVje80OA7RHWnrYDCrd7
+9ntK8mNt6INGsDeQ+rPTohG1FO5tZdpzRkd+wowIOMMuklLvn6XEFFB1hwiuN1/y
+bYLeqivfwC/PyXDxlf5ogkayEMwcKkJ1T/H2yDYR+mzs7/EN4xsBEFCDJQGgQOxy
+FCPd2M+qYwKBgG4KZEbsTxhY3r7W1Sa1JIm9NqxgwRyrcNHekhiRMjL+7vdbZdyg
+/gBBdu5a7DuWBoz2p2Ydo7m6JN2bGz21vPPk3hH2zeLAihm5o/fUIjusGD5epd7G
+RaZ3tFYLTFsxSm3jNinXgOtyRbLnDxiPcBnqb3PNaWaanfoWq5AxT5GxAoGAGkEE
+Satjfj2jVu/fGTgXbD2WZVZTfrTep+bpVcAc46MooKpQOGWvOm7DKUU9ibpZCClu
+oHoI7+dOVvgp4Z6gCMnmsEy0KCtK2gH1Pst2fed6ZN1WWuJx/ESp2X3zgY0x2xUk
+2eNPyvRJcZFCYr8o4g23mhBQSP1fsrk9lD/VkUsCgYBYc9fLiWsvrKhhfs+q7VWr
+wkVlmhOBiebHlO2E9j+l0btmuHRHp7s8JgQ2mV12pte5657pBP+hFsDE31f7pM3l
+fZ4cmFZHr6Mh4egvntIQ7AOAUkmkUUJUcIoIuzwroH0oNNl3R6PojNJzid0yOWfw
+XnymdGqaQsLdyV/9WCsHjA==
+-----END PRIVATE KEY-----
diff --git a/t/with_worker-events/14-tls_active_probes.t_disabled b/t/with_worker-events/14-tls_active_probes.t
similarity index 71%
rename from t/with_worker-events/14-tls_active_probes.t_disabled
rename to t/with_worker-events/14-tls_active_probes.t
index 51854928..4c98d52a 100644
--- a/t/with_worker-events/14-tls_active_probes.t_disabled
+++ b/t/with_worker-events/14-tls_active_probes.t
@@ -7,10 +7,28 @@ plan tests => blocks() * 2;
 
 my $pwd = cwd();
 
+# A local self-signed TLS server is used instead of an external host so that
+# the active TLS probe tests are deterministic and do not depend on network
+# access. The certificate has CN/SAN "example.test"; probing with that
+# hostname (SNI) and certificate verification enabled succeeds, while probing
+# with a mismatching hostname and verification enabled fails.
+$ENV{TEST_NGINX_TLS_CERT} = "$pwd/t/with_worker-events/util/tls_probe_cert.pem";
+$ENV{TEST_NGINX_TLS_KEY} = "$pwd/t/with_worker-events/util/tls_probe_key.pem";
+
 our $HttpConfig = qq{
     lua_package_path "$pwd/lib/?.lua;;";
     lua_shared_dict test_shm 8m;
     lua_shared_dict my_worker_events 8m;
+
+    server {
+        listen 2115 ssl;
+        ssl_certificate $ENV{TEST_NGINX_TLS_CERT};
+        ssl_certificate_key $ENV{TEST_NGINX_TLS_KEY};
+        server_name example.test;
+        location / {
+            return 200 'ok';
+        }
+    }
 };
 
 run_tests();
@@ -23,7 +41,7 @@ __DATA__
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.worker.events"
@@ -47,9 +65,9 @@ __DATA__
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "example.test", false)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com"))  -- true
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "example.test"))  -- true
         }
     }
 --- request
@@ -63,7 +81,7 @@ true
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.worker.events"
@@ -87,9 +105,9 @@ true
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", true)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com"))  -- false
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test"))  -- false
         }
     }
 --- request
@@ -103,7 +121,7 @@ false
 --- http_config eval: $::HttpConfig
 --- config
     location = /t {
-        lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
+        lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
         lua_ssl_verify_depth 2;
         content_by_lua_block {
             local we = require "resty.worker.events"
@@ -128,9 +146,9 @@ false
                     },
                 }
             })
-            local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
+            local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", false)
             ngx.sleep(8) -- wait for 4x the check interval
-            ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com"))  -- true
+            ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test"))  -- true
         }
     }
 --- request
diff --git a/t/with_worker-events/99-status_ver.t b/t/with_worker-events/99-status_ver.t
index c7d2166f..d2793efb 100644
--- a/t/with_worker-events/99-status_ver.t
+++ b/t/with_worker-events/99-status_ver.t
@@ -14,7 +14,26 @@ our $HttpConfig = qq{
     init_worker_by_lua_block {
         local we = require "resty.worker.events"
         assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
+
+        -- Cross-worker delivery barrier: the status-change event must be seen
+        -- by every worker. To make the test deterministic we first confirm
+        -- that events propagate to every worker before adding the target.
+        -- Each worker repeatedly posts a ping carrying its own id; on receiving
+        -- a ping every worker records the sender id in the shared dict. Once
+        -- all worker ids have been observed, event propagation is proven and we
+        -- add the target (whose later status change will then reach all
+        -- workers).
+        local worker_count = ngx.worker.count()
+        local shm = ngx.shared.test_shm
+
+        we.register(function(data)
+            shm:set("barrier:" .. tostring(data), true)
+        end, "barrier", "ping")
+
         ngx.timer.at(0, function()
+            -- Create the checker (and thus register its event subscription)
+            -- up front, but do not add the target yet, so no status change can
+            -- be broadcast before every worker is subscribed.
             local healthcheck = require("resty.healthcheck")
             local checker = healthcheck.new({
                 name = "testing",
@@ -30,6 +49,23 @@ our $HttpConfig = qq{
                     }
                 }
             })
+
+            local my_id = ngx.worker.id()
+            while true do
+                we.post("barrier", "ping", my_id)
+                we.poll()
+                local seen = 0
+                for i = 0, worker_count - 1 do
+                    if shm:get("barrier:" .. tostring(i)) then
+                        seen = seen + 1
+                    end
+                end
+                if seen >= worker_count then
+                    break
+                end
+                ngx.sleep(0.05)
+            end
+
             local ok, err = checker:add_target("127.0.0.1", 11111)
             if not ok then
                 error(err)
@@ -48,7 +84,7 @@ __DATA__
     location = /t {
         content_by_lua_block {
             ngx.say(true)
-            ngx.sleep(0.3) -- wait twice the interval
+            ngx.sleep(1) -- wait for the barrier handshake and status change
         }
     }
 --- request
@@ -58,5 +94,10 @@ true
 --- error_log
 checking unhealthy targets: nothing to do
 checking unhealthy targets: #1
+--- grep_error_log eval: qr/from 'true' to 'false', ver: \d+/
+--- grep_error_log_out eval
+[
+"from 'true' to 'false', ver: 2
 from 'true' to 'false', ver: 2
-from 'true' to 'false', ver: 1
+",
+]
diff --git a/t/with_worker-events/util/tls_probe_cert.pem b/t/with_worker-events/util/tls_probe_cert.pem
new file mode 100644
index 00000000..0119c792
--- /dev/null
+++ b/t/with_worker-events/util/tls_probe_cert.pem
@@ -0,0 +1,21 @@
+-----BEGIN CERTIFICATE-----
+MIIDdDCCAlygAwIBAgIUC+b5FmEe1NkGq7ref9fGBZrmYSQwDQYJKoZIhvcNAQEL
+BQAwPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt
+aGVhbHRoY2hlY2sgdGVzdDAgFw0yNjA2MjIyMDAwMTdaGA8yMTI2MDUyOTIwMDAx
+N1owPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt
+aGVhbHRoY2hlY2sgdGVzdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB
+AKCvRZEsOGkCcICbFc8tMVva1Co16hBZfpZn95qwea41/rcstuT4+uN74fTieMpQ
+20nmYJ1YHV37ilq2zT1jRtHx5sNJX6o1P3fRR+CrcL0xXMSFKQZstpMeqKWb4WnM
+9vYuD5ZPU100LfLrdMP277RBYaa1E5jZPeCZ1B/riO+dUgX+hy6boMBLo7BMc+vP
+Pm+VBs1arcwN3IU/GbzHo3s3GGqR33uV1kKLWUA8eDVLawWKcnCABtNya/j7hGO3
+yCDUD6mXME9MF47PlJXSqXQTKvhUuFY5Lcyfa4n3OYS9Xo5ZISa084vrZrguWc7m
+oxqv2o1iLRzLqPXX+tWVsnMCAwEAAaNsMGowHQYDVR0OBBYEFMWZ7hlvu0TCXl7Y
+WPLomNF9mliLMB8GA1UdIwQYMBaAFMWZ7hlvu0TCXl7YWPLomNF9mliLMA8GA1Ud
+EwEB/wQFMAMBAf8wFwYDVR0RBBAwDoIMZXhhbXBsZS50ZXN0MA0GCSqGSIb3DQEB
+CwUAA4IBAQCYJ/AnSvS7Jvci+tGsQkWMbaPXlP8DpCfGFEzRQ5zrrQBqFdJB/Kfc
+nqdBWw4J/0vt91eaZv9mM93u2RZFTH2V76IlAgc3KahRKM+4XDXWLiassS9w3eZ1
+j3MzFXSrpI80xHX0pNOohlaJ9LcDp4D6kh42g2WfuJKCGfsN/q5eN+7s/4IIC98M
+EC0jTGytU3VKqyxesmiuG+rduBvwNtboaZuXa9WLkNkKQ31miJC+N/D9L6nRY/E4
+z3Kt8cYxMhlA+ckx5Rh/oxPlu6OFhX/e/bkKrDIzrtg6SDmq0x5NK3wQtht9Czrp
+yKEXM7sQtpGs1IDVVgJN5hucDbLzQu2L
+-----END CERTIFICATE-----
diff --git a/t/with_worker-events/util/tls_probe_key.pem b/t/with_worker-events/util/tls_probe_key.pem
new file mode 100644
index 00000000..3c3842b8
--- /dev/null
+++ b/t/with_worker-events/util/tls_probe_key.pem
@@ -0,0 +1,28 @@
+-----BEGIN PRIVATE KEY-----
+MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCgr0WRLDhpAnCA
+mxXPLTFb2tQqNeoQWX6WZ/easHmuNf63LLbk+Prje+H04njKUNtJ5mCdWB1d+4pa
+ts09Y0bR8ebDSV+qNT930Ufgq3C9MVzEhSkGbLaTHqilm+FpzPb2Lg+WT1NdNC3y
+63TD9u+0QWGmtROY2T3gmdQf64jvnVIF/ocum6DAS6OwTHPrzz5vlQbNWq3MDdyF
+Pxm8x6N7Nxhqkd97ldZCi1lAPHg1S2sFinJwgAbTcmv4+4Rjt8gg1A+plzBPTBeO
+z5SV0ql0Eyr4VLhWOS3Mn2uJ9zmEvV6OWSEmtPOL62a4LlnO5qMar9qNYi0cy6j1
+1/rVlbJzAgMBAAECggEACLx5eKL3WYTnmfkUmU7Yi/5x95jhJiIU1wJFvN7TFsza
+Zu94QtW4n6KjeTLQ+hWlqTcXJ1GG8zj9tOCTIxR2jBaYrc3GSj+ajkijcvcklNy4
+saSYbkAojbImiC6D9cJ45jnrNghlEhWyroOXtcnYuYED2VJSA5xCwtHfmHFathr4
+5C1maSTueiFxj+NN8w08oIt5jL4FH5BHb6lFWeBb/dnqXBcaLFakeCVDVm2Sl/BE
++3ykw1zqbBjvkscpEbL9D42NvVD7RDhXJYeUisJbNkowlk4YZh5QMpD+oZiSfYHz
+2ZuLi7oPF/A0LhKK2VNYqIDjtu08VLfuFBZHVteVsQKBgQC10OMw7pSkHQmRybIO
+0gXdsEuMaskUKNnR4BqVy0MudQZOt0pBCVsiAMGe06vAiFLJO3Nbtlu1P5HdzmZN
+UB5Rmpf8QyWhdrpROqISHFi6o/H/aXeb9o3TfRqlrFUN/x6ruiREArqAn8ITEfYs
+ql2NxEh0aS7ZnixYCZamTP7MsQKBgQDiPysTrnE6Ca44dVje80OA7RHWnrYDCrd7
+9ntK8mNt6INGsDeQ+rPTohG1FO5tZdpzRkd+wowIOMMuklLvn6XEFFB1hwiuN1/y
+bYLeqivfwC/PyXDxlf5ogkayEMwcKkJ1T/H2yDYR+mzs7/EN4xsBEFCDJQGgQOxy
+FCPd2M+qYwKBgG4KZEbsTxhY3r7W1Sa1JIm9NqxgwRyrcNHekhiRMjL+7vdbZdyg
+/gBBdu5a7DuWBoz2p2Ydo7m6JN2bGz21vPPk3hH2zeLAihm5o/fUIjusGD5epd7G
+RaZ3tFYLTFsxSm3jNinXgOtyRbLnDxiPcBnqb3PNaWaanfoWq5AxT5GxAoGAGkEE
+Satjfj2jVu/fGTgXbD2WZVZTfrTep+bpVcAc46MooKpQOGWvOm7DKUU9ibpZCClu
+oHoI7+dOVvgp4Z6gCMnmsEy0KCtK2gH1Pst2fed6ZN1WWuJx/ESp2X3zgY0x2xUk
+2eNPyvRJcZFCYr8o4g23mhBQSP1fsrk9lD/VkUsCgYBYc9fLiWsvrKhhfs+q7VWr
+wkVlmhOBiebHlO2E9j+l0btmuHRHp7s8JgQ2mV12pte5657pBP+hFsDE31f7pM3l
+fZ4cmFZHr6Mh4egvntIQ7AOAUkmkUUJUcIoIuzwroH0oNNl3R6PojNJzid0yOWfw
+XnymdGqaQsLdyV/9WCsHjA==
+-----END PRIVATE KEY-----

From 356d12232911f9fa6988baec32ceabb9e2681ef5 Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Tue, 23 Jun 2026 14:48:41 +0800
Subject: [PATCH 06/11] fix: decouple stale-target cleanup from active-checker
 gating

The previous revision gated the whole periodic tick on has_active_checker
and returned before the cleanup loop when no active checker existed. In a
passive-only deployment (checks.passive but no active interval) no worker
ever has an active checker, so delayed_clear()-marked targets were never
purged and leaked in the shm (apache/apisix#13385) -- a regression only on
the pure-passive path, which the existing tests (all using active probes)
did not cover.

Run stale-target cleanup independently of active probing and of the
periodic lock: locking_target_list serializes the purge across workers and
re-reads the shm list each time, so running it on every worker is safe and
idempotent. The periodic lock is still released on has_active_checker so a
disabled worker yields to an active one (apache/apisix#13235); that gate
stays on has_active_checker rather than on hcs being non-empty because a
disabled checker lingers in the weak hcs table with active=false until GC.

Add a passive-only + delayed_clear regression test for both event backends.
---
 lib/resty/healthcheck.lua                     | 108 +++++++++---------
 t/with_resty-events/21-passive-only-cleanup.t |  93 +++++++++++++++
 .../21-passive-only-cleanup.t                 |  77 +++++++++++++
 3 files changed, 227 insertions(+), 51 deletions(-)
 create mode 100644 t/with_resty-events/21-passive-only-cleanup.t
 create mode 100644 t/with_worker-events/21-passive-only-cleanup.t

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index 3ce44af5..5e3f3785 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -1734,54 +1734,27 @@ function _M.new(opts)
       detached = false,
       expire = function()
 
-        -- A worker only contends for the periodic lock when it still owns at
-        -- least one active checker. Checking this *before* acquiring the lock
-        -- avoids a worker whose checkers were all disabled from repeatedly
-        -- re-acquiring and releasing the lock on every tick (the lock would
-        -- otherwise jitter once per back-off interval on a fully disabled
-        -- cluster). When every checker has been stopped the lock holder must
-        -- also release the lock so that other workers which still own active
-        -- checkers can acquire it, otherwise active health checks become
-        -- permanently stuck after a disable -> re-enable cycle.
-        local has_active_checker = false
-        for _, checker_obj in pairs(hcs) do
-          if checker_obj.checks.active.healthy.active or
-             checker_obj.checks.active.unhealthy.active then
-            has_active_checker = true
-            break
-          end
-        end
-
-        if not has_active_checker then
-          -- release the lock only if we are still the holder, then back off
-          -- without contending again
-          if shm:get(key) == ngx_worker_pid() then
-            shm:delete(key)
-          end
-          active_check_timer.interval = CHECK_INTERVAL * 10
-          return
-        end
-
-        if get_periodic_lock(shm, key) then
-          active_check_timer.interval = CHECK_INTERVAL
-          renew_periodic_lock(shm, key)
-        else
-          active_check_timer.interval = CHECK_INTERVAL * 10
-          return
-        end
-
         local cur_time = ngx_now()
-        -- Decide once, before iterating the checkers, whether this is a
-        -- cleanup window. `last_cleanup_check` is module level and shared by
-        -- every checker, so it must be read before the loop and updated only
-        -- after it: updating it inside the loop would make the first checker
-        -- that runs cleanup advance the timestamp and starve every other
-        -- checker in `hcs`, leaving their purge-marked targets in the shm
+
+        -- Stale-target cleanup is decoupled from both active probing and the
+        -- periodic lock. A passive-only deployment (checks.passive but no
+        -- active interval) has no active checker on any worker, yet still marks
+        -- targets via delayed_clear() and must purge them; gating cleanup on
+        -- the periodic lock (which only an active worker ever holds) would leak
+        -- those targets forever (apache/apisix#13385). Cleanup is therefore run
+        -- independently here: locking_target_list serializes the purge across
+        -- workers and re-reads the shm list each time, so running it on every
+        -- worker is safe and idempotent -- a worker that finds the list already
+        -- purged removes nothing and raises no duplicate remove event.
+        --
+        -- `run_cleanup` is decided once, before the loop, and last_cleanup_check
+        -- advanced once after it: last_cleanup_check is module level and shared
+        -- by every checker, so advancing it inside the loop would let the first
+        -- checker starve the rest, leaving their purge-marked targets in the shm
         -- forever. This only manifests with multiple health-checked upstreams.
         local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time
-        for _, checker_obj in pairs(hcs) do
-
-          if run_cleanup then
+        if run_cleanup then
+          for _, checker_obj in pairs(hcs) do
             -- clear targets marked for delayed removal
             locking_target_list(checker_obj, function(target_list)
               local removed_targets = {}
@@ -1811,7 +1784,46 @@ function _M.new(opts)
               end
             end)
           end
+          last_cleanup_check = cur_time
+        end
 
+        -- The periodic lock distributes ACTIVE probing to a single worker. A
+        -- worker with no active checker must release the lock (if it holds it)
+        -- and back off, so a worker that still owns active checkers can take
+        -- over; otherwise active health checks stay stuck after a disable ->
+        -- re-enable cycle (apache/apisix#13235). This is gated on
+        -- has_active_checker rather than on `hcs` being non-empty, because a
+        -- disabled checker lingers in the weak `hcs` table with active=false
+        -- until it is garbage collected.
+        local has_active_checker = false
+        for _, checker_obj in pairs(hcs) do
+          if checker_obj.checks.active.healthy.active or
+             checker_obj.checks.active.unhealthy.active then
+            has_active_checker = true
+            break
+          end
+        end
+
+        if not has_active_checker then
+          -- release the lock only if we are still the holder, then back off
+          -- without contending again
+          if shm:get(key) == ngx_worker_pid() then
+            shm:delete(key)
+          end
+          active_check_timer.interval = CHECK_INTERVAL * 10
+          return
+        end
+
+        if get_periodic_lock(shm, key) then
+          active_check_timer.interval = CHECK_INTERVAL
+          renew_periodic_lock(shm, key)
+        else
+          active_check_timer.interval = CHECK_INTERVAL * 10
+          return
+        end
+
+        -- active probing: only the periodic-lock holder runs the probes
+        for _, checker_obj in pairs(hcs) do
           if checker_obj.checks.active.healthy.active and
             (checker_obj.checks.active.healthy.last_run +
               checker_obj.checks.active.healthy.interval <= cur_time)
@@ -1828,12 +1840,6 @@ function _M.new(opts)
             checker_callback(checker_obj, "unhealthy")
           end
         end
-
-        -- Advance the cleanup window once, after every checker in `hcs` has
-        -- had a chance to purge its stale targets in this iteration.
-        if run_cleanup then
-          last_cleanup_check = cur_time
-        end
       end,
     })
     if not active_check_timer then
diff --git a/t/with_resty-events/21-passive-only-cleanup.t b/t/with_resty-events/21-passive-only-cleanup.t
new file mode 100644
index 00000000..9c33755d
--- /dev/null
+++ b/t/with_resty-events/21-passive-only-cleanup.t
@@ -0,0 +1,93 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+$ENV{TEST_NGINX_SERVROOT} = server_root();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+
+    init_worker_by_lua_block {
+        _G.__TESTING_HEALTHCHECKER = true
+        local we = require "resty.events.compat"
+        assert(we.configure({
+            unique_timeout = 5,
+            broker_id = 0,
+            listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
+        }))
+        assert(we.configured())
+    }
+
+    server {
+        server_name kong_worker_events;
+        listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
+        access_log off;
+        location / {
+            content_by_lua_block {
+                require("resty.events.compat").run()
+            }
+        }
+    }
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: stale-target cleanup runs for a passive-only checker (no active checks)
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local healthcheck = require("resty.healthcheck")
+            -- shorten the cleanup window so it fires within the test
+            healthcheck._set_cleanup_interval(0.2)
+
+            -- A passive-only checker: no active interval is configured, so
+            -- checks.active.*.active stays false on every worker and the
+            -- periodic timer never holds the lock for active probing. The
+            -- delayed_clear() cleanup must still run, otherwise purge-marked
+            -- targets leak forever (apache/apisix#13385).
+            local checker = assert(healthcheck.new({
+                name = "passive-only",
+                shm_name = "test_shm",
+                events_module = "resty.events",
+                checks = {
+                    passive = {
+                        type = "http",
+                        healthy   = { successes = 1 },
+                        unhealthy = { http_failures = 1 },
+                    }
+                }
+            }))
+
+            for i = 1, 3 do
+                assert(checker:add_target("127.0.0.1", 20000 + i, nil, true))
+            end
+
+            -- no active checker exists on this worker
+            ngx.say("has active: ",
+                checker.checks.active.healthy.active or
+                checker.checks.active.unhealthy.active)
+
+            -- mark every target for immediate delayed removal
+            assert(checker:delayed_clear(0))
+
+            -- wait long enough for several cleanup windows to elapse
+            ngx.sleep(1.5)
+
+            local list = healthcheck.get_target_list("passive-only", "test_shm")
+            ngx.say("remaining: ", #list)
+        }
+    }
+--- request
+GET /t
+--- response_body
+has active: false
+remaining: 0
+--- timeout: 5
diff --git a/t/with_worker-events/21-passive-only-cleanup.t b/t/with_worker-events/21-passive-only-cleanup.t
new file mode 100644
index 00000000..f40ccb3a
--- /dev/null
+++ b/t/with_worker-events/21-passive-only-cleanup.t
@@ -0,0 +1,77 @@
+use Test::Nginx::Socket::Lua;
+use Cwd qw(cwd);
+
+workers(1);
+
+plan tests => repeat_each() * (blocks() * 2);
+
+my $pwd = cwd();
+
+our $HttpConfig = qq{
+    lua_package_path "$pwd/lib/?.lua;;";
+    lua_shared_dict test_shm 8m;
+    lua_shared_dict my_worker_events 8m;
+
+    init_worker_by_lua_block {
+        _G.__TESTING_HEALTHCHECKER = true
+    }
+};
+
+run_tests();
+
+__DATA__
+
+=== TEST 1: stale-target cleanup runs for a passive-only checker (no active checks)
+--- http_config eval: $::HttpConfig
+--- config
+    location = /t {
+        content_by_lua_block {
+            local we = require "resty.worker.events"
+            assert(we.configure{ shm = "my_worker_events", interval = 0.1 })
+
+            local healthcheck = require("resty.healthcheck")
+            -- shorten the cleanup window so it fires within the test
+            healthcheck._set_cleanup_interval(0.2)
+
+            -- A passive-only checker: no active interval is configured, so
+            -- checks.active.*.active stays false on every worker and the
+            -- periodic timer never holds the lock for active probing. The
+            -- delayed_clear() cleanup must still run, otherwise purge-marked
+            -- targets leak forever (apache/apisix#13385).
+            local checker = assert(healthcheck.new({
+                name = "passive-only",
+                shm_name = "test_shm",
+                checks = {
+                    passive = {
+                        type = "http",
+                        healthy   = { successes = 1 },
+                        unhealthy = { http_failures = 1 },
+                    }
+                }
+            }))
+
+            for i = 1, 3 do
+                assert(checker:add_target("127.0.0.1", 20000 + i, nil, true))
+            end
+
+            -- no active checker exists on this worker
+            ngx.say("has active: ",
+                checker.checks.active.healthy.active or
+                checker.checks.active.unhealthy.active)
+
+            -- mark every target for immediate delayed removal
+            assert(checker:delayed_clear(0))
+
+            -- wait long enough for several cleanup windows to elapse
+            ngx.sleep(1.5)
+
+            local list = healthcheck.get_target_list("passive-only", "test_shm")
+            ngx.say("remaining: ", #list)
+        }
+    }
+--- request
+GET /t
+--- response_body
+has active: false
+remaining: 0
+--- timeout: 5

From ae3e2f9c42687d00d7feb9b58ec7e5207a50aa0a Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Tue, 23 Jun 2026 14:53:53 +0800
Subject: [PATCH 07/11] test: normalize has-active boolean in passive-only
 cleanup test

For a passive-only checker checks.active.*.active is nil (never set to
true by start(), never set to false by stop()), so ngx.say rendered it as
"nil" rather than "false". Normalize with `not not` so the assertion
is stable. The cleanup assertion (remaining: 0) already passed, confirming
the fix purges delayed_clear targets with no active checker.
---
 t/with_resty-events/21-passive-only-cleanup.t  | 5 ++---
 t/with_worker-events/21-passive-only-cleanup.t | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/t/with_resty-events/21-passive-only-cleanup.t b/t/with_resty-events/21-passive-only-cleanup.t
index 9c33755d..71b9815e 100644
--- a/t/with_resty-events/21-passive-only-cleanup.t
+++ b/t/with_resty-events/21-passive-only-cleanup.t
@@ -71,9 +71,8 @@ __DATA__
             end
 
             -- no active checker exists on this worker
-            ngx.say("has active: ",
-                checker.checks.active.healthy.active or
-                checker.checks.active.unhealthy.active)
+            ngx.say("has active: ", not not (checker.checks.active.healthy.active or
+                                  checker.checks.active.unhealthy.active))
 
             -- mark every target for immediate delayed removal
             assert(checker:delayed_clear(0))
diff --git a/t/with_worker-events/21-passive-only-cleanup.t b/t/with_worker-events/21-passive-only-cleanup.t
index f40ccb3a..2f0ecc46 100644
--- a/t/with_worker-events/21-passive-only-cleanup.t
+++ b/t/with_worker-events/21-passive-only-cleanup.t
@@ -55,9 +55,8 @@ __DATA__
             end
 
             -- no active checker exists on this worker
-            ngx.say("has active: ",
-                checker.checks.active.healthy.active or
-                checker.checks.active.unhealthy.active)
+            ngx.say("has active: ", not not (checker.checks.active.healthy.active or
+                                  checker.checks.active.unhealthy.active))
 
             -- mark every target for immediate delayed removal
             assert(checker:delayed_clear(0))

From ea63000225a61dd231d9bda2982ea52311ba8e39 Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Wed, 24 Jun 2026 05:16:27 +0800
Subject: [PATCH 08/11] fix: elect one cleanup worker per window and resume
 probing promptly

Two follow-ups from the PR review of the cleanup/lock-gating rework:

1. Stale-target cleanup ran on every worker each window, so N workers x M
   checkers all contended on each checker's TARGET_LIST_LOCK. Elect a single
   worker per window via an atomic shm:add on a dedicated cleanup key
   (TTL = CLEANUP_INTERVAL). Any worker can win -- including a passive-only
   one -- so the apache/apisix#13385 leak fix still holds, while the rest skip
   the pass instead of fighting for the lock. Removes the per-worker
   last_cleanup_check bookkeeping.

2. After a disable -> re-enable cycle, a backed-off timer (running at
   CHECK_INTERVAL * 10 while idle or after losing the lock race) took up to
   one slow tick to resume probing. checker:start() now wakes the timer back
   to CHECK_INTERVAL when it re-enables active checks.
---
 lib/resty/healthcheck.lua | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index 5e3f3785..9891a04e 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -236,7 +236,6 @@ local hcs = setmetatable({}, {
 })
 
 local active_check_timer
-local last_cleanup_check
 
 -- serialize a table to a string
 local serialize = codec.encode
@@ -1423,6 +1422,15 @@ function checker:start()
   worker_events.unregister(self.ev_callback, self.EVENT_SOURCE)  -- ensure we never double subscribe
   worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE)
 
+  -- If the active-check timer is already running but backed off (idle worker,
+  -- or one that lost the periodic-lock race, runs at CHECK_INTERVAL * 10),
+  -- wake it up so a freshly (re-)enabled checker resumes probing promptly
+  -- instead of waiting up to one slow tick.
+  if active_check_timer and
+     (self.checks.active.healthy.active or self.checks.active.unhealthy.active) then
+    active_check_timer.interval = CHECK_INTERVAL
+  end
+
   self:log(DEBUG, "active check flagged as active")
   return true
 end
@@ -1726,7 +1734,7 @@ function _M.new(opts)
     self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
       "starting active check timer")
     local shm, key = self.shm, self.PERIODIC_LOCK
-    last_cleanup_check = ngx_now()
+    local cleanup_key = key .. ":cleanup"
     active_check_timer, err = resty_timer({
       recurring = true,
       interval = CHECK_INTERVAL,
@@ -1736,24 +1744,20 @@ function _M.new(opts)
 
         local cur_time = ngx_now()
 
-        -- Stale-target cleanup is decoupled from both active probing and the
+        -- Stale-target cleanup is decoupled from active probing and the
         -- periodic lock. A passive-only deployment (checks.passive but no
         -- active interval) has no active checker on any worker, yet still marks
         -- targets via delayed_clear() and must purge them; gating cleanup on
         -- the periodic lock (which only an active worker ever holds) would leak
-        -- those targets forever (apache/apisix#13385). Cleanup is therefore run
-        -- independently here: locking_target_list serializes the purge across
-        -- workers and re-reads the shm list each time, so running it on every
-        -- worker is safe and idempotent -- a worker that finds the list already
-        -- purged removes nothing and raises no duplicate remove event.
+        -- those targets forever (apache/apisix#13385).
         --
-        -- `run_cleanup` is decided once, before the loop, and last_cleanup_check
-        -- advanced once after it: last_cleanup_check is module level and shared
-        -- by every checker, so advancing it inside the loop would let the first
-        -- checker starve the rest, leaving their purge-marked targets in the shm
-        -- forever. This only manifests with multiple health-checked upstreams.
-        local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time
-        if run_cleanup then
+        -- A single worker per window is elected via an atomic shm:add on
+        -- cleanup_key (TTL = CLEANUP_INTERVAL): any worker can win, including a
+        -- passive-only one, so the leak fix holds, while the other workers skip
+        -- the pass and do not all contend on each checker's TARGET_LIST_LOCK.
+        -- The elected worker purges every checker in one pass.
+        local won, add_err = shm:add(cleanup_key, ngx_worker_pid(), CLEANUP_INTERVAL)
+        if won then
           for _, checker_obj in pairs(hcs) do
             -- clear targets marked for delayed removal
             locking_target_list(checker_obj, function(target_list)
@@ -1784,7 +1788,8 @@ function _M.new(opts)
               end
             end)
           end
-          last_cleanup_check = cur_time
+        elseif add_err ~= "exists" then
+          ngx_log(ERR, "failed to elect cleanup worker for '", cleanup_key, "': ", add_err)
         end
 
         -- The periodic lock distributes ACTIVE probing to a single worker. A

From 8c718404dfbb35d050c571da75655e30e4fd47ac Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Wed, 24 Jun 2026 05:16:33 +0800
Subject: [PATCH 09/11] test: bound the cross-worker barrier wait in
 99-status_ver

The barrier handshake spun in an unbounded `while true` loop, so a worker
that never synced would hang until the global test timeout with no useful
diagnostic. Cap the wait with a deadline and fail loudly with the synced
count when it is exceeded.
---
 t/with_resty-events/99-status_ver.t  | 8 ++++++++
 t/with_worker-events/99-status_ver.t | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/t/with_resty-events/99-status_ver.t b/t/with_resty-events/99-status_ver.t
index dcb796f7..ba498525 100644
--- a/t/with_resty-events/99-status_ver.t
+++ b/t/with_resty-events/99-status_ver.t
@@ -59,6 +59,9 @@ our $HttpConfig = qq{
             })
 
             local my_id = ngx.worker.id()
+            local deadline = ngx.now() + 5  -- bound the barrier wait so a
+                                            -- missing worker fails loudly
+                                            -- instead of spinning forever
             while true do
                 we.post("barrier", "ping", my_id)
                 local seen = 0
@@ -70,6 +73,11 @@ our $HttpConfig = qq{
                 if seen >= worker_count then
                     break
                 end
+                ngx.update_time()
+                if ngx.now() >= deadline then
+                    error("barrier not reached: " .. seen .. "/" ..
+                          worker_count .. " workers synced before timeout")
+                end
                 ngx.sleep(0.05)
             end
 
diff --git a/t/with_worker-events/99-status_ver.t b/t/with_worker-events/99-status_ver.t
index d2793efb..2dcec5a1 100644
--- a/t/with_worker-events/99-status_ver.t
+++ b/t/with_worker-events/99-status_ver.t
@@ -51,6 +51,9 @@ our $HttpConfig = qq{
             })
 
             local my_id = ngx.worker.id()
+            local deadline = ngx.now() + 5  -- bound the barrier wait so a
+                                            -- missing worker fails loudly
+                                            -- instead of spinning forever
             while true do
                 we.post("barrier", "ping", my_id)
                 we.poll()
@@ -63,6 +66,11 @@ our $HttpConfig = qq{
                 if seen >= worker_count then
                     break
                 end
+                ngx.update_time()
+                if ngx.now() >= deadline then
+                    error("barrier not reached: " .. seen .. "/" ..
+                          worker_count .. " workers synced before timeout")
+                end
                 ngx.sleep(0.05)
             end
 

From 24ea30cce71f83ff443dbf9f7cca32c5d55e1edd Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Wed, 24 Jun 2026 08:44:02 +0800
Subject: [PATCH 10/11] fix: drop ineffective timer-wake on checker re-enable

The interval-wake added to checker:start() could not bring a re-enabled
checker back to fast probing: lua-resty-timer (~> 1) schedules each recurring
sleep from self.sub_interval, which is frozen to the initial interval at
construction and never re-synced from self.interval; self.interval only feeds
scheduling for node-wide (key_name) timers, which the active-check timer is
not. Mutating active_check_timer.interval from start() (or anywhere) therefore
has no effect on the timer cadence, so the optimization was a no-op. Remove it.
---
 lib/resty/healthcheck.lua | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index 9891a04e..4db29f1d 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -1422,15 +1422,6 @@ function checker:start()
   worker_events.unregister(self.ev_callback, self.EVENT_SOURCE)  -- ensure we never double subscribe
   worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE)
 
-  -- If the active-check timer is already running but backed off (idle worker,
-  -- or one that lost the periodic-lock race, runs at CHECK_INTERVAL * 10),
-  -- wake it up so a freshly (re-)enabled checker resumes probing promptly
-  -- instead of waiting up to one slow tick.
-  if active_check_timer and
-     (self.checks.active.healthy.active or self.checks.active.unhealthy.active) then
-    active_check_timer.interval = CHECK_INTERVAL
-  end
-
   self:log(DEBUG, "active check flagged as active")
   return true
 end

From 65326650a7d9e2a5da7db77c473b25a05b461e69 Mon Sep 17 00:00:00 2001
From: AlinsRan <rongxin@api7.ai>
Date: Fri, 26 Jun 2026 14:42:14 +0800
Subject: [PATCH 11/11] refactor: drop no-op active_check_timer.interval
 back-off assignments

lua-resty-timer schedules each recurring sleep from sub_interval (frozen at
construction), never re-reading self.interval, so mutating
active_check_timer.interval has no effect on the timer cadence -- the same
reason the start() interval-wake was removed in 24ea30c. Remove the three
remaining no-op assignments in the periodic tick so the code no longer implies
an idle back-off that never happens.

Also document that the per-window cleanup election purges only the elected
worker's own hcs and is eventually consistent under asymmetric checker
distribution.
---
 lib/resty/healthcheck.lua | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua
index 4db29f1d..a60571ab 100644
--- a/lib/resty/healthcheck.lua
+++ b/lib/resty/healthcheck.lua
@@ -1746,7 +1746,12 @@ function _M.new(opts)
         -- cleanup_key (TTL = CLEANUP_INTERVAL): any worker can win, including a
         -- passive-only one, so the leak fix holds, while the other workers skip
         -- the pass and do not all contend on each checker's TARGET_LIST_LOCK.
-        -- The elected worker purges every checker in one pass.
+        -- The elected worker purges every checker in its own `hcs` in one pass.
+        -- Purging writes to the shared shm target_list, so cleaning a checker on
+        -- any one worker is globally effective; if checkers are distributed
+        -- asymmetrically across workers, a given upstream is purged in whichever
+        -- window a worker that owns its checker wins the election -- eventually
+        -- consistent rather than every-worker-every-window.
         local won, add_err = shm:add(cleanup_key, ngx_worker_pid(), CLEANUP_INTERVAL)
         if won then
           for _, checker_obj in pairs(hcs) do
@@ -1801,20 +1806,17 @@ function _M.new(opts)
         end
 
         if not has_active_checker then
-          -- release the lock only if we are still the holder, then back off
-          -- without contending again
+          -- release the lock only if we are still the holder, so a worker that
+          -- still owns active checkers can take over
           if shm:get(key) == ngx_worker_pid() then
             shm:delete(key)
           end
-          active_check_timer.interval = CHECK_INTERVAL * 10
           return
         end
 
         if get_periodic_lock(shm, key) then
-          active_check_timer.interval = CHECK_INTERVAL
           renew_periodic_lock(shm, key)
         else
-          active_check_timer.interval = CHECK_INTERVAL * 10
           return
         end