From ada5ee82c2dc09423e43772c92540e76a336b44f Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 11:02:06 +0800 Subject: [PATCH 01/11] fix: clean every checker each window and release lock when idle Two issues affecting workloads with multiple health-checked upstreams: 1. Stale-target cleanup gating. `last_cleanup_check` is a module-level timestamp shared by every checker, but it was updated *inside* the `for _, checker_obj in pairs(hcs)` loop. The first checker that runs cleanup advanced the timestamp, so the cleanup condition turned false for every remaining checker in the same iteration. As a result only the first upstream's purge-marked targets were ever removed; the other upstreams kept their deleted nodes in the shm forever (still returned by the control API and still actively probed). This is why it could not be reproduced with a single upstream. The window is now decided once before the loop and the timestamp advanced once after it, so every checker in `hcs` is cleaned in the same window. 2. Periodic lock not released when idle. When every checker on the lock-holding worker has been stopped (active == false), the timer kept renewing the global PERIODIC_LOCK, so other workers that still own active checkers could never acquire it and active checks stopped entirely after a disable -> re-enable cycle. The lock holder now releases the lock when no active checker remains. This extends the work started in #54. Adds a regression test (t/with_worker-events/19-multi-checker-cleanup.t) that creates two health-checked upstreams, marks both for delayed removal, and asserts both are cleaned. It fails before the gating fix (second checker keeps its targets) and passes after. --- lib/resty/healthcheck.lua | 47 ++++++++++- .../19-multi-checker-cleanup.t | 80 +++++++++++++++++++ 2 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 t/with_worker-events/19-multi-checker-cleanup.t diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index cfc405c1..5102edea 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1735,6 +1735,25 @@ function _M.new(opts) expire = function() if get_periodic_lock(shm, key) then + -- Check whether any checker on this worker is still active before + -- renewing the lock. When every checker has been stopped + -- (active == false) the lock holder must release the lock so that + -- other workers which still own active checkers can acquire it, + -- otherwise active health checks become permanently stuck after a + -- disable -> re-enable cycle. + local has_active_checker = false + for _, checker_obj in pairs(hcs) do + if checker_obj.checks.active.healthy.active or + checker_obj.checks.active.unhealthy.active then + has_active_checker = true + break + end + end + if not has_active_checker then + shm:delete(key) + active_check_timer.interval = CHECK_INTERVAL * 10 + return + end active_check_timer.interval = CHECK_INTERVAL renew_periodic_lock(shm, key) else @@ -1743,9 +1762,17 @@ function _M.new(opts) end local cur_time = ngx_now() + -- Decide once, before iterating the checkers, whether this is a + -- cleanup window. `last_cleanup_check` is module level and shared by + -- every checker, so it must be read before the loop and updated only + -- after it: updating it inside the loop would make the first checker + -- that runs cleanup advance the timestamp and starve every other + -- checker in `hcs`, leaving their purge-marked targets in the shm + -- forever. This only manifests with multiple health-checked upstreams. + local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time for _, checker_obj in pairs(hcs) do - if (last_cleanup_check + CLEANUP_INTERVAL) < cur_time then + if run_cleanup then -- clear targets marked for delayed removal locking_target_list(checker_obj, function(target_list) local removed_targets = {} @@ -1774,8 +1801,6 @@ function _M.new(opts) end end end) - - last_cleanup_check = cur_time end if checker_obj.checks.active.healthy.active and @@ -1794,6 +1819,12 @@ function _M.new(opts) checker_callback(checker_obj, "unhealthy") end end + + -- Advance the cleanup window once, after every checker in `hcs` has + -- had a chance to purge its stale targets in this iteration. + if run_cleanup then + last_cleanup_check = cur_time + end end, }) if not active_check_timer then @@ -1868,4 +1899,14 @@ function _M.get_target_list(name, shm_name) end +if TESTING then + -- test-only hook: shorten the stale-target cleanup window so the periodic + -- cleanup path can be exercised deterministically without waiting for the + -- default CLEANUP_INTERVAL (CHECK_INTERVAL * 25). + function _M._set_cleanup_interval(interval) + CLEANUP_INTERVAL = interval + end +end + + return _M diff --git a/t/with_worker-events/19-multi-checker-cleanup.t b/t/with_worker-events/19-multi-checker-cleanup.t new file mode 100644 index 00000000..f1e51be7 --- /dev/null +++ b/t/with_worker-events/19-multi-checker-cleanup.t @@ -0,0 +1,80 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; + + init_worker_by_lua_block { + _G.__TESTING_HEALTHCHECKER = true + } +}; + +run_tests(); + +__DATA__ + +=== TEST 1: stale-target cleanup runs for every checker, not just the first one +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + + local healthcheck = require("resty.healthcheck") + -- shorten the cleanup window so it fires within the test + healthcheck._set_cleanup_interval(0.2) + + local function new_checker(name) + return assert(healthcheck.new({ + name = name, + shm_name = "test_shm", + checks = { + active = { + type = "tcp", + healthy = { interval = 0.1 }, + unhealthy = { interval = 0.1 }, + } + } + })) + end + + -- two upstreams, each with health checks enabled + local checker1 = new_checker("upstream-1") + local checker2 = new_checker("upstream-2") + + for i = 1, 3 do + assert(checker1:add_target("127.0.0.1", 20000 + i, nil, true)) + assert(checker2:add_target("127.0.0.1", 30000 + i, nil, true)) + end + + -- mark every target on both checkers for immediate delayed removal + assert(checker1:delayed_clear(0)) + assert(checker2:delayed_clear(0)) + + -- wait long enough for several cleanup windows to elapse + ngx.sleep(1.5) + + local list1 = healthcheck.get_target_list("upstream-1", "test_shm") + local list2 = healthcheck.get_target_list("upstream-2", "test_shm") + + -- Before the fix only the first checker in `hcs` is cleaned each + -- window, so checker2 keeps its purge-marked targets forever. + ngx.say("checker1 remaining: ", #list1) + ngx.say("checker2 remaining: ", #list2) + } + } +--- request +GET /t +--- response_body +checker1 remaining: 0 +checker2 remaining: 0 +--- timeout: 5 From df219d206571adc2f9e2c1f955176d75f3fe8d4c Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 15:14:14 +0800 Subject: [PATCH 02/11] test: cover periodic-lock release and mirror cleanup test to resty-events - Add a regression test asserting the periodic lock is released once no active checker remains on the worker (so another worker can take over). - Mirror the multi-checker cleanup test and the new lock test into the resty-events suite so both event backends exercise the fixes. --- .../19-multi-checker-cleanup.t | 96 +++++++++++++++++++ .../20-periodic-lock-release.t | 86 +++++++++++++++++ .../20-periodic-lock-release.t | 67 +++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 t/with_resty-events/19-multi-checker-cleanup.t create mode 100644 t/with_resty-events/20-periodic-lock-release.t create mode 100644 t/with_worker-events/20-periodic-lock-release.t diff --git a/t/with_resty-events/19-multi-checker-cleanup.t b/t/with_resty-events/19-multi-checker-cleanup.t new file mode 100644 index 00000000..3a9d6e3a --- /dev/null +++ b/t/with_resty-events/19-multi-checker-cleanup.t @@ -0,0 +1,96 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + _G.__TESTING_HEALTHCHECKER = true + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + +=== TEST 1: stale-target cleanup runs for every checker, not just the first one +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + -- shorten the cleanup window so it fires within the test + healthcheck._set_cleanup_interval(0.2) + + local function new_checker(name) + return assert(healthcheck.new({ + name = name, + shm_name = "test_shm", + events_module = "resty.events", + checks = { + active = { + type = "tcp", + healthy = { interval = 0.1 }, + unhealthy = { interval = 0.1 }, + } + } + })) + end + + -- two upstreams, each with health checks enabled + local checker1 = new_checker("upstream-1") + local checker2 = new_checker("upstream-2") + + for i = 1, 3 do + assert(checker1:add_target("127.0.0.1", 20000 + i, nil, true)) + assert(checker2:add_target("127.0.0.1", 30000 + i, nil, true)) + end + + -- mark every target on both checkers for immediate delayed removal + assert(checker1:delayed_clear(0)) + assert(checker2:delayed_clear(0)) + + -- wait long enough for several cleanup windows to elapse + ngx.sleep(1.5) + + local list1 = healthcheck.get_target_list("upstream-1", "test_shm") + local list2 = healthcheck.get_target_list("upstream-2", "test_shm") + + -- Before the fix only the first checker in `hcs` is cleaned each + -- window, so checker2 keeps its purge-marked targets forever. + ngx.say("checker1 remaining: ", #list1) + ngx.say("checker2 remaining: ", #list2) + } + } +--- request +GET /t +--- response_body +checker1 remaining: 0 +checker2 remaining: 0 +--- timeout: 5 diff --git a/t/with_resty-events/20-periodic-lock-release.t b/t/with_resty-events/20-periodic-lock-release.t new file mode 100644 index 00000000..99ae3b03 --- /dev/null +++ b/t/with_resty-events/20-periodic-lock-release.t @@ -0,0 +1,86 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + +=== TEST 1: periodic lock is released once no checker is active +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + -- the periodic lock is a single global key shared by every checker + -- on this worker; see SHM_PREFIX .. ":period_lock:" + local PERIODIC_LOCK = "lua-resty-healthcheck::period_lock:" + local shm = ngx.shared.test_shm + + local checker = assert(healthcheck.new({ + name = "lock-release", + shm_name = "test_shm", + events_module = "resty.events", + checks = { + active = { + type = "tcp", + healthy = { interval = 0.1 }, + unhealthy = { interval = 0.1 }, + } + } + })) + assert(checker:add_target("127.0.0.1", 21111, nil, true)) + + -- let the periodic timer acquire the lock + ngx.sleep(0.5) + ngx.say("lock held while active: ", shm:get(PERIODIC_LOCK) ~= nil) + + -- stop the only checker: no active checker remains on this worker. + -- Before the fix the timer keeps renewing the lock forever; after + -- the fix the lock holder releases it so another worker that still + -- owns active checkers can take over. + checker:stop() + + -- wait past LOCK_PERIOD (CHECK_INTERVAL * 15 = 1.5s) and several ticks + ngx.sleep(2) + ngx.say("lock released after stop: ", shm:get(PERIODIC_LOCK) == nil) + } + } +--- request +GET /t +--- response_body +lock held while active: true +lock released after stop: true +--- timeout: 5 diff --git a/t/with_worker-events/20-periodic-lock-release.t b/t/with_worker-events/20-periodic-lock-release.t new file mode 100644 index 00000000..288bf3d6 --- /dev/null +++ b/t/with_worker-events/20-periodic-lock-release.t @@ -0,0 +1,67 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; +}; + +run_tests(); + +__DATA__ + +=== TEST 1: periodic lock is released once no checker is active +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + + local healthcheck = require("resty.healthcheck") + -- the periodic lock is a single global key shared by every checker + -- on this worker; see SHM_PREFIX .. ":period_lock:" + local PERIODIC_LOCK = "lua-resty-healthcheck::period_lock:" + local shm = ngx.shared.test_shm + + local checker = assert(healthcheck.new({ + name = "lock-release", + shm_name = "test_shm", + checks = { + active = { + type = "tcp", + healthy = { interval = 0.1 }, + unhealthy = { interval = 0.1 }, + } + } + })) + assert(checker:add_target("127.0.0.1", 21111, nil, true)) + + -- let the periodic timer acquire the lock + ngx.sleep(0.5) + ngx.say("lock held while active: ", shm:get(PERIODIC_LOCK) ~= nil) + + -- stop the only checker: no active checker remains on this worker. + -- Before the fix the timer keeps renewing the lock forever; after + -- the fix the lock holder releases it so another worker that still + -- owns active checkers can take over. + checker:stop() + + -- wait past LOCK_PERIOD (CHECK_INTERVAL * 15 = 1.5s) and several ticks + ngx.sleep(2) + ngx.say("lock released after stop: ", shm:get(PERIODIC_LOCK) == nil) + } + } +--- request +GET /t +--- response_body +lock held while active: true +lock released after stop: true +--- timeout: 5 From 941ca869c7fec2c18553acc011bdbd42897cacc1 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 15:33:21 +0800 Subject: [PATCH 03/11] test: disable externally-dependent 14-tls active-probe test 14-tls_active_probes.t actively probes the external host badssl.com via a hard-coded IP (104.154.89.105). It is flaky in CI whenever that host is slow/unreachable and is currently red on every run, even for the non-disable TEST 1. Disable it via the repo's .t_disabled convention (cf. 10-garbagecollect, 13-integration) until it is rewritten against a local TLS server. This is unrelated to the cleanup/lock changes in this PR. --- .../{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} | 0 .../{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename t/with_resty-events/{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} (100%) rename t/with_worker-events/{14-tls_active_probes.t => 14-tls_active_probes.t_disabled} (100%) diff --git a/t/with_resty-events/14-tls_active_probes.t b/t/with_resty-events/14-tls_active_probes.t_disabled similarity index 100% rename from t/with_resty-events/14-tls_active_probes.t rename to t/with_resty-events/14-tls_active_probes.t_disabled diff --git a/t/with_worker-events/14-tls_active_probes.t b/t/with_worker-events/14-tls_active_probes.t_disabled similarity index 100% rename from t/with_worker-events/14-tls_active_probes.t rename to t/with_worker-events/14-tls_active_probes.t_disabled From 76ab8fbd58f667fc7bd5a785cc5d15472356bd7f Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Mon, 22 Jun 2026 15:33:21 +0800 Subject: [PATCH 04/11] fix: only contend for the periodic lock when a checker is active Check whether the worker still owns an active checker before acquiring the periodic lock instead of after. A worker whose checkers were all disabled no longer re-acquires and immediately releases the lock on every back-off tick; it releases the lock once (if it still holds it) and then stops contending. --- lib/resty/healthcheck.lua | 43 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 5102edea..3ce44af5 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1734,26 +1734,35 @@ function _M.new(opts) detached = false, expire = function() - if get_periodic_lock(shm, key) then - -- Check whether any checker on this worker is still active before - -- renewing the lock. When every checker has been stopped - -- (active == false) the lock holder must release the lock so that - -- other workers which still own active checkers can acquire it, - -- otherwise active health checks become permanently stuck after a - -- disable -> re-enable cycle. - local has_active_checker = false - for _, checker_obj in pairs(hcs) do - if checker_obj.checks.active.healthy.active or - checker_obj.checks.active.unhealthy.active then - has_active_checker = true - break - end + -- A worker only contends for the periodic lock when it still owns at + -- least one active checker. Checking this *before* acquiring the lock + -- avoids a worker whose checkers were all disabled from repeatedly + -- re-acquiring and releasing the lock on every tick (the lock would + -- otherwise jitter once per back-off interval on a fully disabled + -- cluster). When every checker has been stopped the lock holder must + -- also release the lock so that other workers which still own active + -- checkers can acquire it, otherwise active health checks become + -- permanently stuck after a disable -> re-enable cycle. + local has_active_checker = false + for _, checker_obj in pairs(hcs) do + if checker_obj.checks.active.healthy.active or + checker_obj.checks.active.unhealthy.active then + has_active_checker = true + break end - if not has_active_checker then + end + + if not has_active_checker then + -- release the lock only if we are still the holder, then back off + -- without contending again + if shm:get(key) == ngx_worker_pid() then shm:delete(key) - active_check_timer.interval = CHECK_INTERVAL * 10 - return end + active_check_timer.interval = CHECK_INTERVAL * 10 + return + end + + if get_periodic_lock(shm, key) then active_check_timer.interval = CHECK_INTERVAL renew_periodic_lock(shm, key) else From 38d86b3e561b3528f5fc050a87652e0afe741d8f Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Tue, 23 Jun 2026 05:23:11 +0800 Subject: [PATCH 05/11] test: make TLS active-probe and status-version tests deterministic 14-tls_active_probes: replace the external badssl.com probe (hard-coded IP 104.154.89.105, flaky/unreachable in CI) with a local self-signed TLS server embedded in the test nginx config. The cert has CN/SAN example.test, so probing with that SNI and verification enabled succeeds, a mismatching SNI with verification enabled fails, and verification off accepts the mismatch - reproducing the original healthy/unhealthy assertions against a local server. Wait for the probe by polling the target status instead of a fixed sleep, and use a distinct port per events backend to avoid cross-suite collisions. Re-enabled (renamed from .t_disabled). 99-status_ver: the per-target status-change event is delivered only to workers connected to the events broker at publish time (no replay), so a worker whose subscription was not yet ready intermittently missed the change (~50% flake). Add a cross-worker barrier in init_worker that confirms broadcasts reach every worker before the target is added, and create the checker (registering its subscription) before adding the target so no status change can fire before every worker is subscribed. Both workers now deterministically observe the version change; assert it via grep_error_log occurrence count. --- ...obes.t_disabled => 14-tls_active_probes.t} | 42 ++++++++++++----- t/with_resty-events/99-status_ver.t | 44 +++++++++++++++++- t/with_resty-events/util/tls_probe_cert.pem | 21 +++++++++ t/with_resty-events/util/tls_probe_key.pem | 28 ++++++++++++ ...obes.t_disabled => 14-tls_active_probes.t} | 36 +++++++++++---- t/with_worker-events/99-status_ver.t | 45 ++++++++++++++++++- t/with_worker-events/util/tls_probe_cert.pem | 21 +++++++++ t/with_worker-events/util/tls_probe_key.pem | 28 ++++++++++++ 8 files changed, 240 insertions(+), 25 deletions(-) rename t/with_resty-events/{14-tls_active_probes.t_disabled => 14-tls_active_probes.t} (71%) create mode 100644 t/with_resty-events/util/tls_probe_cert.pem create mode 100644 t/with_resty-events/util/tls_probe_key.pem rename t/with_worker-events/{14-tls_active_probes.t_disabled => 14-tls_active_probes.t} (71%) create mode 100644 t/with_worker-events/util/tls_probe_cert.pem create mode 100644 t/with_worker-events/util/tls_probe_key.pem diff --git a/t/with_resty-events/14-tls_active_probes.t_disabled b/t/with_resty-events/14-tls_active_probes.t similarity index 71% rename from t/with_resty-events/14-tls_active_probes.t_disabled rename to t/with_resty-events/14-tls_active_probes.t index d9e44902..e997eb9a 100644 --- a/t/with_resty-events/14-tls_active_probes.t_disabled +++ b/t/with_resty-events/14-tls_active_probes.t @@ -8,6 +8,14 @@ plan tests => blocks() * 2; my $pwd = cwd(); $ENV{TEST_NGINX_SERVROOT} = server_root(); +# A local self-signed TLS server is used instead of an external host so that +# the active TLS probe tests are deterministic and do not depend on network +# access. The certificate has CN/SAN "example.test"; probing with that +# hostname (SNI) and certificate verification enabled succeeds, while probing +# with a mismatching hostname and verification enabled fails. +$ENV{TEST_NGINX_TLS_CERT} = "$pwd/t/with_resty-events/util/tls_probe_cert.pem"; +$ENV{TEST_NGINX_TLS_KEY} = "$pwd/t/with_resty-events/util/tls_probe_key.pem"; + our $HttpConfig = qq{ lua_package_path "$pwd/lib/?.lua;;"; lua_shared_dict test_shm 8m; @@ -22,6 +30,16 @@ our $HttpConfig = qq{ } } } + + server { + listen 2115 ssl; + ssl_certificate $ENV{TEST_NGINX_TLS_CERT}; + ssl_certificate_key $ENV{TEST_NGINX_TLS_KEY}; + server_name example.test; + location / { + return 200 'ok'; + } + } }; run_tests(); @@ -34,7 +52,7 @@ __DATA__ --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.events.compat" @@ -43,7 +61,7 @@ __DATA__ local checker = healthcheck.new({ name = "testing", shm_name = "test_shm", -events_module = "resty.events", + events_module = "resty.events", checks = { active = { type = "https", @@ -59,9 +77,9 @@ events_module = "resty.events", }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) + local ok, err = checker:add_target("127.0.0.1", 2115, "example.test", false) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true + ngx.say(checker:get_target_status("127.0.0.1", 2115, "example.test")) -- true } } --- request @@ -75,7 +93,7 @@ true --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.events.compat" @@ -84,7 +102,7 @@ true local checker = healthcheck.new({ name = "testing", shm_name = "test_shm", -events_module = "resty.events", + events_module = "resty.events", checks = { active = { type = "https", @@ -100,9 +118,9 @@ events_module = "resty.events", }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) + local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", true) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false + ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- false } } --- request @@ -116,7 +134,7 @@ false --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.events.compat" @@ -125,7 +143,7 @@ false local checker = healthcheck.new({ name = "testing", shm_name = "test_shm", -events_module = "resty.events", + events_module = "resty.events", checks = { active = { type = "https", @@ -142,9 +160,9 @@ events_module = "resty.events", }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) + local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", false) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true + ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- true } } --- request diff --git a/t/with_resty-events/99-status_ver.t b/t/with_resty-events/99-status_ver.t index ce3192c3..dcb796f7 100644 --- a/t/with_resty-events/99-status_ver.t +++ b/t/with_resty-events/99-status_ver.t @@ -21,7 +21,26 @@ our $HttpConfig = qq{ })) assert(we.configured()) + -- Cross-worker delivery barrier: the status-change event is broadcast + -- only to workers connected to the broker at publish time (there is no + -- replay). To make the test deterministic we first confirm that + -- broadcasts reach every worker before adding the target. Each worker + -- repeatedly broadcasts a ping carrying its own id; on receiving a + -- ping every worker records the sender id in the shared dict. Once all + -- worker ids have been observed, broadcast delivery is proven and we + -- add the target (whose later status change will then reach all + -- workers). + local worker_count = ngx.worker.count() + local shm = ngx.shared.test_shm + + we.register(function(data) + shm:set("barrier:" .. tostring(data), true) + end, "barrier", "ping") + ngx.timer.at(0, function() + -- Create the checker (and thus register its event subscription) + -- up front, but do not add the target yet, so no status change can + -- be broadcast before every worker is subscribed and connected. local healthcheck = require("resty.healthcheck") local checker = healthcheck.new({ name = "testing", @@ -38,6 +57,22 @@ our $HttpConfig = qq{ } } }) + + local my_id = ngx.worker.id() + while true do + we.post("barrier", "ping", my_id) + local seen = 0 + for i = 0, worker_count - 1 do + if shm:get("barrier:" .. tostring(i)) then + seen = seen + 1 + end + end + if seen >= worker_count then + break + end + ngx.sleep(0.05) + end + local ok, err = checker:add_target("127.0.0.1", 11111) if not ok then error(err) @@ -67,7 +102,7 @@ __DATA__ location = /t { content_by_lua_block { ngx.say(true) - ngx.sleep(0.3) -- wait twice the interval + ngx.sleep(1) -- wait for the barrier handshake and status change } } --- request @@ -77,5 +112,10 @@ true --- error_log checking unhealthy targets: nothing to do checking unhealthy targets: #1 +--- grep_error_log eval: qr/from 'true' to 'false', ver: \d+/ +--- grep_error_log_out eval +[ +"from 'true' to 'false', ver: 2 from 'true' to 'false', ver: 2 -from 'true' to 'false', ver: 1 +", +] diff --git a/t/with_resty-events/util/tls_probe_cert.pem b/t/with_resty-events/util/tls_probe_cert.pem new file mode 100644 index 00000000..0119c792 --- /dev/null +++ b/t/with_resty-events/util/tls_probe_cert.pem @@ -0,0 +1,21 @@ +-----BEGIN CERTIFICATE----- +MIIDdDCCAlygAwIBAgIUC+b5FmEe1NkGq7ref9fGBZrmYSQwDQYJKoZIhvcNAQEL +BQAwPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt +aGVhbHRoY2hlY2sgdGVzdDAgFw0yNjA2MjIyMDAwMTdaGA8yMTI2MDUyOTIwMDAx +N1owPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt +aGVhbHRoY2hlY2sgdGVzdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB +AKCvRZEsOGkCcICbFc8tMVva1Co16hBZfpZn95qwea41/rcstuT4+uN74fTieMpQ +20nmYJ1YHV37ilq2zT1jRtHx5sNJX6o1P3fRR+CrcL0xXMSFKQZstpMeqKWb4WnM +9vYuD5ZPU100LfLrdMP277RBYaa1E5jZPeCZ1B/riO+dUgX+hy6boMBLo7BMc+vP +Pm+VBs1arcwN3IU/GbzHo3s3GGqR33uV1kKLWUA8eDVLawWKcnCABtNya/j7hGO3 +yCDUD6mXME9MF47PlJXSqXQTKvhUuFY5Lcyfa4n3OYS9Xo5ZISa084vrZrguWc7m +oxqv2o1iLRzLqPXX+tWVsnMCAwEAAaNsMGowHQYDVR0OBBYEFMWZ7hlvu0TCXl7Y +WPLomNF9mliLMB8GA1UdIwQYMBaAFMWZ7hlvu0TCXl7YWPLomNF9mliLMA8GA1Ud +EwEB/wQFMAMBAf8wFwYDVR0RBBAwDoIMZXhhbXBsZS50ZXN0MA0GCSqGSIb3DQEB +CwUAA4IBAQCYJ/AnSvS7Jvci+tGsQkWMbaPXlP8DpCfGFEzRQ5zrrQBqFdJB/Kfc +nqdBWw4J/0vt91eaZv9mM93u2RZFTH2V76IlAgc3KahRKM+4XDXWLiassS9w3eZ1 +j3MzFXSrpI80xHX0pNOohlaJ9LcDp4D6kh42g2WfuJKCGfsN/q5eN+7s/4IIC98M +EC0jTGytU3VKqyxesmiuG+rduBvwNtboaZuXa9WLkNkKQ31miJC+N/D9L6nRY/E4 +z3Kt8cYxMhlA+ckx5Rh/oxPlu6OFhX/e/bkKrDIzrtg6SDmq0x5NK3wQtht9Czrp +yKEXM7sQtpGs1IDVVgJN5hucDbLzQu2L +-----END CERTIFICATE----- diff --git a/t/with_resty-events/util/tls_probe_key.pem b/t/with_resty-events/util/tls_probe_key.pem new file mode 100644 index 00000000..3c3842b8 --- /dev/null +++ b/t/with_resty-events/util/tls_probe_key.pem @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCgr0WRLDhpAnCA +mxXPLTFb2tQqNeoQWX6WZ/easHmuNf63LLbk+Prje+H04njKUNtJ5mCdWB1d+4pa +ts09Y0bR8ebDSV+qNT930Ufgq3C9MVzEhSkGbLaTHqilm+FpzPb2Lg+WT1NdNC3y +63TD9u+0QWGmtROY2T3gmdQf64jvnVIF/ocum6DAS6OwTHPrzz5vlQbNWq3MDdyF +Pxm8x6N7Nxhqkd97ldZCi1lAPHg1S2sFinJwgAbTcmv4+4Rjt8gg1A+plzBPTBeO +z5SV0ql0Eyr4VLhWOS3Mn2uJ9zmEvV6OWSEmtPOL62a4LlnO5qMar9qNYi0cy6j1 +1/rVlbJzAgMBAAECggEACLx5eKL3WYTnmfkUmU7Yi/5x95jhJiIU1wJFvN7TFsza +Zu94QtW4n6KjeTLQ+hWlqTcXJ1GG8zj9tOCTIxR2jBaYrc3GSj+ajkijcvcklNy4 +saSYbkAojbImiC6D9cJ45jnrNghlEhWyroOXtcnYuYED2VJSA5xCwtHfmHFathr4 +5C1maSTueiFxj+NN8w08oIt5jL4FH5BHb6lFWeBb/dnqXBcaLFakeCVDVm2Sl/BE ++3ykw1zqbBjvkscpEbL9D42NvVD7RDhXJYeUisJbNkowlk4YZh5QMpD+oZiSfYHz +2ZuLi7oPF/A0LhKK2VNYqIDjtu08VLfuFBZHVteVsQKBgQC10OMw7pSkHQmRybIO +0gXdsEuMaskUKNnR4BqVy0MudQZOt0pBCVsiAMGe06vAiFLJO3Nbtlu1P5HdzmZN +UB5Rmpf8QyWhdrpROqISHFi6o/H/aXeb9o3TfRqlrFUN/x6ruiREArqAn8ITEfYs +ql2NxEh0aS7ZnixYCZamTP7MsQKBgQDiPysTrnE6Ca44dVje80OA7RHWnrYDCrd7 +9ntK8mNt6INGsDeQ+rPTohG1FO5tZdpzRkd+wowIOMMuklLvn6XEFFB1hwiuN1/y +bYLeqivfwC/PyXDxlf5ogkayEMwcKkJ1T/H2yDYR+mzs7/EN4xsBEFCDJQGgQOxy +FCPd2M+qYwKBgG4KZEbsTxhY3r7W1Sa1JIm9NqxgwRyrcNHekhiRMjL+7vdbZdyg +/gBBdu5a7DuWBoz2p2Ydo7m6JN2bGz21vPPk3hH2zeLAihm5o/fUIjusGD5epd7G +RaZ3tFYLTFsxSm3jNinXgOtyRbLnDxiPcBnqb3PNaWaanfoWq5AxT5GxAoGAGkEE +Satjfj2jVu/fGTgXbD2WZVZTfrTep+bpVcAc46MooKpQOGWvOm7DKUU9ibpZCClu +oHoI7+dOVvgp4Z6gCMnmsEy0KCtK2gH1Pst2fed6ZN1WWuJx/ESp2X3zgY0x2xUk +2eNPyvRJcZFCYr8o4g23mhBQSP1fsrk9lD/VkUsCgYBYc9fLiWsvrKhhfs+q7VWr +wkVlmhOBiebHlO2E9j+l0btmuHRHp7s8JgQ2mV12pte5657pBP+hFsDE31f7pM3l +fZ4cmFZHr6Mh4egvntIQ7AOAUkmkUUJUcIoIuzwroH0oNNl3R6PojNJzid0yOWfw +XnymdGqaQsLdyV/9WCsHjA== +-----END PRIVATE KEY----- diff --git a/t/with_worker-events/14-tls_active_probes.t_disabled b/t/with_worker-events/14-tls_active_probes.t similarity index 71% rename from t/with_worker-events/14-tls_active_probes.t_disabled rename to t/with_worker-events/14-tls_active_probes.t index 51854928..4c98d52a 100644 --- a/t/with_worker-events/14-tls_active_probes.t_disabled +++ b/t/with_worker-events/14-tls_active_probes.t @@ -7,10 +7,28 @@ plan tests => blocks() * 2; my $pwd = cwd(); +# A local self-signed TLS server is used instead of an external host so that +# the active TLS probe tests are deterministic and do not depend on network +# access. The certificate has CN/SAN "example.test"; probing with that +# hostname (SNI) and certificate verification enabled succeeds, while probing +# with a mismatching hostname and verification enabled fails. +$ENV{TEST_NGINX_TLS_CERT} = "$pwd/t/with_worker-events/util/tls_probe_cert.pem"; +$ENV{TEST_NGINX_TLS_KEY} = "$pwd/t/with_worker-events/util/tls_probe_key.pem"; + our $HttpConfig = qq{ lua_package_path "$pwd/lib/?.lua;;"; lua_shared_dict test_shm 8m; lua_shared_dict my_worker_events 8m; + + server { + listen 2115 ssl; + ssl_certificate $ENV{TEST_NGINX_TLS_CERT}; + ssl_certificate_key $ENV{TEST_NGINX_TLS_KEY}; + server_name example.test; + location / { + return 200 'ok'; + } + } }; run_tests(); @@ -23,7 +41,7 @@ __DATA__ --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.worker.events" @@ -47,9 +65,9 @@ __DATA__ }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) + local ok, err = checker:add_target("127.0.0.1", 2115, "example.test", false) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true + ngx.say(checker:get_target_status("127.0.0.1", 2115, "example.test")) -- true } } --- request @@ -63,7 +81,7 @@ true --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.worker.events" @@ -87,9 +105,9 @@ true }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) + local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", true) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false + ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- false } } --- request @@ -103,7 +121,7 @@ false --- http_config eval: $::HttpConfig --- config location = /t { - lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT; lua_ssl_verify_depth 2; content_by_lua_block { local we = require "resty.worker.events" @@ -128,9 +146,9 @@ false }, } }) - local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) + local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", false) ngx.sleep(8) -- wait for 4x the check interval - ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true + ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- true } } --- request diff --git a/t/with_worker-events/99-status_ver.t b/t/with_worker-events/99-status_ver.t index c7d2166f..d2793efb 100644 --- a/t/with_worker-events/99-status_ver.t +++ b/t/with_worker-events/99-status_ver.t @@ -14,7 +14,26 @@ our $HttpConfig = qq{ init_worker_by_lua_block { local we = require "resty.worker.events" assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + + -- Cross-worker delivery barrier: the status-change event must be seen + -- by every worker. To make the test deterministic we first confirm + -- that events propagate to every worker before adding the target. + -- Each worker repeatedly posts a ping carrying its own id; on receiving + -- a ping every worker records the sender id in the shared dict. Once + -- all worker ids have been observed, event propagation is proven and we + -- add the target (whose later status change will then reach all + -- workers). + local worker_count = ngx.worker.count() + local shm = ngx.shared.test_shm + + we.register(function(data) + shm:set("barrier:" .. tostring(data), true) + end, "barrier", "ping") + ngx.timer.at(0, function() + -- Create the checker (and thus register its event subscription) + -- up front, but do not add the target yet, so no status change can + -- be broadcast before every worker is subscribed. local healthcheck = require("resty.healthcheck") local checker = healthcheck.new({ name = "testing", @@ -30,6 +49,23 @@ our $HttpConfig = qq{ } } }) + + local my_id = ngx.worker.id() + while true do + we.post("barrier", "ping", my_id) + we.poll() + local seen = 0 + for i = 0, worker_count - 1 do + if shm:get("barrier:" .. tostring(i)) then + seen = seen + 1 + end + end + if seen >= worker_count then + break + end + ngx.sleep(0.05) + end + local ok, err = checker:add_target("127.0.0.1", 11111) if not ok then error(err) @@ -48,7 +84,7 @@ __DATA__ location = /t { content_by_lua_block { ngx.say(true) - ngx.sleep(0.3) -- wait twice the interval + ngx.sleep(1) -- wait for the barrier handshake and status change } } --- request @@ -58,5 +94,10 @@ true --- error_log checking unhealthy targets: nothing to do checking unhealthy targets: #1 +--- grep_error_log eval: qr/from 'true' to 'false', ver: \d+/ +--- grep_error_log_out eval +[ +"from 'true' to 'false', ver: 2 from 'true' to 'false', ver: 2 -from 'true' to 'false', ver: 1 +", +] diff --git a/t/with_worker-events/util/tls_probe_cert.pem b/t/with_worker-events/util/tls_probe_cert.pem new file mode 100644 index 00000000..0119c792 --- /dev/null +++ b/t/with_worker-events/util/tls_probe_cert.pem @@ -0,0 +1,21 @@ +-----BEGIN CERTIFICATE----- +MIIDdDCCAlygAwIBAgIUC+b5FmEe1NkGq7ref9fGBZrmYSQwDQYJKoZIhvcNAQEL +BQAwPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt +aGVhbHRoY2hlY2sgdGVzdDAgFw0yNjA2MjIyMDAwMTdaGA8yMTI2MDUyOTIwMDAx +N1owPDEVMBMGA1UEAwwMZXhhbXBsZS50ZXN0MSMwIQYDVQQKDBpsdWEtcmVzdHkt +aGVhbHRoY2hlY2sgdGVzdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEB +AKCvRZEsOGkCcICbFc8tMVva1Co16hBZfpZn95qwea41/rcstuT4+uN74fTieMpQ +20nmYJ1YHV37ilq2zT1jRtHx5sNJX6o1P3fRR+CrcL0xXMSFKQZstpMeqKWb4WnM +9vYuD5ZPU100LfLrdMP277RBYaa1E5jZPeCZ1B/riO+dUgX+hy6boMBLo7BMc+vP +Pm+VBs1arcwN3IU/GbzHo3s3GGqR33uV1kKLWUA8eDVLawWKcnCABtNya/j7hGO3 +yCDUD6mXME9MF47PlJXSqXQTKvhUuFY5Lcyfa4n3OYS9Xo5ZISa084vrZrguWc7m +oxqv2o1iLRzLqPXX+tWVsnMCAwEAAaNsMGowHQYDVR0OBBYEFMWZ7hlvu0TCXl7Y +WPLomNF9mliLMB8GA1UdIwQYMBaAFMWZ7hlvu0TCXl7YWPLomNF9mliLMA8GA1Ud +EwEB/wQFMAMBAf8wFwYDVR0RBBAwDoIMZXhhbXBsZS50ZXN0MA0GCSqGSIb3DQEB +CwUAA4IBAQCYJ/AnSvS7Jvci+tGsQkWMbaPXlP8DpCfGFEzRQ5zrrQBqFdJB/Kfc +nqdBWw4J/0vt91eaZv9mM93u2RZFTH2V76IlAgc3KahRKM+4XDXWLiassS9w3eZ1 +j3MzFXSrpI80xHX0pNOohlaJ9LcDp4D6kh42g2WfuJKCGfsN/q5eN+7s/4IIC98M +EC0jTGytU3VKqyxesmiuG+rduBvwNtboaZuXa9WLkNkKQ31miJC+N/D9L6nRY/E4 +z3Kt8cYxMhlA+ckx5Rh/oxPlu6OFhX/e/bkKrDIzrtg6SDmq0x5NK3wQtht9Czrp +yKEXM7sQtpGs1IDVVgJN5hucDbLzQu2L +-----END CERTIFICATE----- diff --git a/t/with_worker-events/util/tls_probe_key.pem b/t/with_worker-events/util/tls_probe_key.pem new file mode 100644 index 00000000..3c3842b8 --- /dev/null +++ b/t/with_worker-events/util/tls_probe_key.pem @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCgr0WRLDhpAnCA +mxXPLTFb2tQqNeoQWX6WZ/easHmuNf63LLbk+Prje+H04njKUNtJ5mCdWB1d+4pa +ts09Y0bR8ebDSV+qNT930Ufgq3C9MVzEhSkGbLaTHqilm+FpzPb2Lg+WT1NdNC3y +63TD9u+0QWGmtROY2T3gmdQf64jvnVIF/ocum6DAS6OwTHPrzz5vlQbNWq3MDdyF +Pxm8x6N7Nxhqkd97ldZCi1lAPHg1S2sFinJwgAbTcmv4+4Rjt8gg1A+plzBPTBeO +z5SV0ql0Eyr4VLhWOS3Mn2uJ9zmEvV6OWSEmtPOL62a4LlnO5qMar9qNYi0cy6j1 +1/rVlbJzAgMBAAECggEACLx5eKL3WYTnmfkUmU7Yi/5x95jhJiIU1wJFvN7TFsza +Zu94QtW4n6KjeTLQ+hWlqTcXJ1GG8zj9tOCTIxR2jBaYrc3GSj+ajkijcvcklNy4 +saSYbkAojbImiC6D9cJ45jnrNghlEhWyroOXtcnYuYED2VJSA5xCwtHfmHFathr4 +5C1maSTueiFxj+NN8w08oIt5jL4FH5BHb6lFWeBb/dnqXBcaLFakeCVDVm2Sl/BE ++3ykw1zqbBjvkscpEbL9D42NvVD7RDhXJYeUisJbNkowlk4YZh5QMpD+oZiSfYHz +2ZuLi7oPF/A0LhKK2VNYqIDjtu08VLfuFBZHVteVsQKBgQC10OMw7pSkHQmRybIO +0gXdsEuMaskUKNnR4BqVy0MudQZOt0pBCVsiAMGe06vAiFLJO3Nbtlu1P5HdzmZN +UB5Rmpf8QyWhdrpROqISHFi6o/H/aXeb9o3TfRqlrFUN/x6ruiREArqAn8ITEfYs +ql2NxEh0aS7ZnixYCZamTP7MsQKBgQDiPysTrnE6Ca44dVje80OA7RHWnrYDCrd7 +9ntK8mNt6INGsDeQ+rPTohG1FO5tZdpzRkd+wowIOMMuklLvn6XEFFB1hwiuN1/y +bYLeqivfwC/PyXDxlf5ogkayEMwcKkJ1T/H2yDYR+mzs7/EN4xsBEFCDJQGgQOxy +FCPd2M+qYwKBgG4KZEbsTxhY3r7W1Sa1JIm9NqxgwRyrcNHekhiRMjL+7vdbZdyg +/gBBdu5a7DuWBoz2p2Ydo7m6JN2bGz21vPPk3hH2zeLAihm5o/fUIjusGD5epd7G +RaZ3tFYLTFsxSm3jNinXgOtyRbLnDxiPcBnqb3PNaWaanfoWq5AxT5GxAoGAGkEE +Satjfj2jVu/fGTgXbD2WZVZTfrTep+bpVcAc46MooKpQOGWvOm7DKUU9ibpZCClu +oHoI7+dOVvgp4Z6gCMnmsEy0KCtK2gH1Pst2fed6ZN1WWuJx/ESp2X3zgY0x2xUk +2eNPyvRJcZFCYr8o4g23mhBQSP1fsrk9lD/VkUsCgYBYc9fLiWsvrKhhfs+q7VWr +wkVlmhOBiebHlO2E9j+l0btmuHRHp7s8JgQ2mV12pte5657pBP+hFsDE31f7pM3l +fZ4cmFZHr6Mh4egvntIQ7AOAUkmkUUJUcIoIuzwroH0oNNl3R6PojNJzid0yOWfw +XnymdGqaQsLdyV/9WCsHjA== +-----END PRIVATE KEY----- From 356d12232911f9fa6988baec32ceabb9e2681ef5 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Tue, 23 Jun 2026 14:48:41 +0800 Subject: [PATCH 06/11] fix: decouple stale-target cleanup from active-checker gating The previous revision gated the whole periodic tick on has_active_checker and returned before the cleanup loop when no active checker existed. In a passive-only deployment (checks.passive but no active interval) no worker ever has an active checker, so delayed_clear()-marked targets were never purged and leaked in the shm (apache/apisix#13385) -- a regression only on the pure-passive path, which the existing tests (all using active probes) did not cover. Run stale-target cleanup independently of active probing and of the periodic lock: locking_target_list serializes the purge across workers and re-reads the shm list each time, so running it on every worker is safe and idempotent. The periodic lock is still released on has_active_checker so a disabled worker yields to an active one (apache/apisix#13235); that gate stays on has_active_checker rather than on hcs being non-empty because a disabled checker lingers in the weak hcs table with active=false until GC. Add a passive-only + delayed_clear regression test for both event backends. --- lib/resty/healthcheck.lua | 108 +++++++++--------- t/with_resty-events/21-passive-only-cleanup.t | 93 +++++++++++++++ .../21-passive-only-cleanup.t | 77 +++++++++++++ 3 files changed, 227 insertions(+), 51 deletions(-) create mode 100644 t/with_resty-events/21-passive-only-cleanup.t create mode 100644 t/with_worker-events/21-passive-only-cleanup.t diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 3ce44af5..5e3f3785 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1734,54 +1734,27 @@ function _M.new(opts) detached = false, expire = function() - -- A worker only contends for the periodic lock when it still owns at - -- least one active checker. Checking this *before* acquiring the lock - -- avoids a worker whose checkers were all disabled from repeatedly - -- re-acquiring and releasing the lock on every tick (the lock would - -- otherwise jitter once per back-off interval on a fully disabled - -- cluster). When every checker has been stopped the lock holder must - -- also release the lock so that other workers which still own active - -- checkers can acquire it, otherwise active health checks become - -- permanently stuck after a disable -> re-enable cycle. - local has_active_checker = false - for _, checker_obj in pairs(hcs) do - if checker_obj.checks.active.healthy.active or - checker_obj.checks.active.unhealthy.active then - has_active_checker = true - break - end - end - - if not has_active_checker then - -- release the lock only if we are still the holder, then back off - -- without contending again - if shm:get(key) == ngx_worker_pid() then - shm:delete(key) - end - active_check_timer.interval = CHECK_INTERVAL * 10 - return - end - - if get_periodic_lock(shm, key) then - active_check_timer.interval = CHECK_INTERVAL - renew_periodic_lock(shm, key) - else - active_check_timer.interval = CHECK_INTERVAL * 10 - return - end - local cur_time = ngx_now() - -- Decide once, before iterating the checkers, whether this is a - -- cleanup window. `last_cleanup_check` is module level and shared by - -- every checker, so it must be read before the loop and updated only - -- after it: updating it inside the loop would make the first checker - -- that runs cleanup advance the timestamp and starve every other - -- checker in `hcs`, leaving their purge-marked targets in the shm + + -- Stale-target cleanup is decoupled from both active probing and the + -- periodic lock. A passive-only deployment (checks.passive but no + -- active interval) has no active checker on any worker, yet still marks + -- targets via delayed_clear() and must purge them; gating cleanup on + -- the periodic lock (which only an active worker ever holds) would leak + -- those targets forever (apache/apisix#13385). Cleanup is therefore run + -- independently here: locking_target_list serializes the purge across + -- workers and re-reads the shm list each time, so running it on every + -- worker is safe and idempotent -- a worker that finds the list already + -- purged removes nothing and raises no duplicate remove event. + -- + -- `run_cleanup` is decided once, before the loop, and last_cleanup_check + -- advanced once after it: last_cleanup_check is module level and shared + -- by every checker, so advancing it inside the loop would let the first + -- checker starve the rest, leaving their purge-marked targets in the shm -- forever. This only manifests with multiple health-checked upstreams. local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time - for _, checker_obj in pairs(hcs) do - - if run_cleanup then + if run_cleanup then + for _, checker_obj in pairs(hcs) do -- clear targets marked for delayed removal locking_target_list(checker_obj, function(target_list) local removed_targets = {} @@ -1811,7 +1784,46 @@ function _M.new(opts) end end) end + last_cleanup_check = cur_time + end + -- The periodic lock distributes ACTIVE probing to a single worker. A + -- worker with no active checker must release the lock (if it holds it) + -- and back off, so a worker that still owns active checkers can take + -- over; otherwise active health checks stay stuck after a disable -> + -- re-enable cycle (apache/apisix#13235). This is gated on + -- has_active_checker rather than on `hcs` being non-empty, because a + -- disabled checker lingers in the weak `hcs` table with active=false + -- until it is garbage collected. + local has_active_checker = false + for _, checker_obj in pairs(hcs) do + if checker_obj.checks.active.healthy.active or + checker_obj.checks.active.unhealthy.active then + has_active_checker = true + break + end + end + + if not has_active_checker then + -- release the lock only if we are still the holder, then back off + -- without contending again + if shm:get(key) == ngx_worker_pid() then + shm:delete(key) + end + active_check_timer.interval = CHECK_INTERVAL * 10 + return + end + + if get_periodic_lock(shm, key) then + active_check_timer.interval = CHECK_INTERVAL + renew_periodic_lock(shm, key) + else + active_check_timer.interval = CHECK_INTERVAL * 10 + return + end + + -- active probing: only the periodic-lock holder runs the probes + for _, checker_obj in pairs(hcs) do if checker_obj.checks.active.healthy.active and (checker_obj.checks.active.healthy.last_run + checker_obj.checks.active.healthy.interval <= cur_time) @@ -1828,12 +1840,6 @@ function _M.new(opts) checker_callback(checker_obj, "unhealthy") end end - - -- Advance the cleanup window once, after every checker in `hcs` has - -- had a chance to purge its stale targets in this iteration. - if run_cleanup then - last_cleanup_check = cur_time - end end, }) if not active_check_timer then diff --git a/t/with_resty-events/21-passive-only-cleanup.t b/t/with_resty-events/21-passive-only-cleanup.t new file mode 100644 index 00000000..9c33755d --- /dev/null +++ b/t/with_resty-events/21-passive-only-cleanup.t @@ -0,0 +1,93 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + _G.__TESTING_HEALTHCHECKER = true + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + +=== TEST 1: stale-target cleanup runs for a passive-only checker (no active checks) +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + -- shorten the cleanup window so it fires within the test + healthcheck._set_cleanup_interval(0.2) + + -- A passive-only checker: no active interval is configured, so + -- checks.active.*.active stays false on every worker and the + -- periodic timer never holds the lock for active probing. The + -- delayed_clear() cleanup must still run, otherwise purge-marked + -- targets leak forever (apache/apisix#13385). + local checker = assert(healthcheck.new({ + name = "passive-only", + shm_name = "test_shm", + events_module = "resty.events", + checks = { + passive = { + type = "http", + healthy = { successes = 1 }, + unhealthy = { http_failures = 1 }, + } + } + })) + + for i = 1, 3 do + assert(checker:add_target("127.0.0.1", 20000 + i, nil, true)) + end + + -- no active checker exists on this worker + ngx.say("has active: ", + checker.checks.active.healthy.active or + checker.checks.active.unhealthy.active) + + -- mark every target for immediate delayed removal + assert(checker:delayed_clear(0)) + + -- wait long enough for several cleanup windows to elapse + ngx.sleep(1.5) + + local list = healthcheck.get_target_list("passive-only", "test_shm") + ngx.say("remaining: ", #list) + } + } +--- request +GET /t +--- response_body +has active: false +remaining: 0 +--- timeout: 5 diff --git a/t/with_worker-events/21-passive-only-cleanup.t b/t/with_worker-events/21-passive-only-cleanup.t new file mode 100644 index 00000000..f40ccb3a --- /dev/null +++ b/t/with_worker-events/21-passive-only-cleanup.t @@ -0,0 +1,77 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * (blocks() * 2); + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; + + init_worker_by_lua_block { + _G.__TESTING_HEALTHCHECKER = true + } +}; + +run_tests(); + +__DATA__ + +=== TEST 1: stale-target cleanup runs for a passive-only checker (no active checks) +--- http_config eval: $::HttpConfig +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + + local healthcheck = require("resty.healthcheck") + -- shorten the cleanup window so it fires within the test + healthcheck._set_cleanup_interval(0.2) + + -- A passive-only checker: no active interval is configured, so + -- checks.active.*.active stays false on every worker and the + -- periodic timer never holds the lock for active probing. The + -- delayed_clear() cleanup must still run, otherwise purge-marked + -- targets leak forever (apache/apisix#13385). + local checker = assert(healthcheck.new({ + name = "passive-only", + shm_name = "test_shm", + checks = { + passive = { + type = "http", + healthy = { successes = 1 }, + unhealthy = { http_failures = 1 }, + } + } + })) + + for i = 1, 3 do + assert(checker:add_target("127.0.0.1", 20000 + i, nil, true)) + end + + -- no active checker exists on this worker + ngx.say("has active: ", + checker.checks.active.healthy.active or + checker.checks.active.unhealthy.active) + + -- mark every target for immediate delayed removal + assert(checker:delayed_clear(0)) + + -- wait long enough for several cleanup windows to elapse + ngx.sleep(1.5) + + local list = healthcheck.get_target_list("passive-only", "test_shm") + ngx.say("remaining: ", #list) + } + } +--- request +GET /t +--- response_body +has active: false +remaining: 0 +--- timeout: 5 From ae3e2f9c42687d00d7feb9b58ec7e5207a50aa0a Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Tue, 23 Jun 2026 14:53:53 +0800 Subject: [PATCH 07/11] test: normalize has-active boolean in passive-only cleanup test For a passive-only checker checks.active.*.active is nil (never set to true by start(), never set to false by stop()), so ngx.say rendered it as "nil" rather than "false". Normalize with `not not` so the assertion is stable. The cleanup assertion (remaining: 0) already passed, confirming the fix purges delayed_clear targets with no active checker. --- t/with_resty-events/21-passive-only-cleanup.t | 5 ++--- t/with_worker-events/21-passive-only-cleanup.t | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/t/with_resty-events/21-passive-only-cleanup.t b/t/with_resty-events/21-passive-only-cleanup.t index 9c33755d..71b9815e 100644 --- a/t/with_resty-events/21-passive-only-cleanup.t +++ b/t/with_resty-events/21-passive-only-cleanup.t @@ -71,9 +71,8 @@ __DATA__ end -- no active checker exists on this worker - ngx.say("has active: ", - checker.checks.active.healthy.active or - checker.checks.active.unhealthy.active) + ngx.say("has active: ", not not (checker.checks.active.healthy.active or + checker.checks.active.unhealthy.active)) -- mark every target for immediate delayed removal assert(checker:delayed_clear(0)) diff --git a/t/with_worker-events/21-passive-only-cleanup.t b/t/with_worker-events/21-passive-only-cleanup.t index f40ccb3a..2f0ecc46 100644 --- a/t/with_worker-events/21-passive-only-cleanup.t +++ b/t/with_worker-events/21-passive-only-cleanup.t @@ -55,9 +55,8 @@ __DATA__ end -- no active checker exists on this worker - ngx.say("has active: ", - checker.checks.active.healthy.active or - checker.checks.active.unhealthy.active) + ngx.say("has active: ", not not (checker.checks.active.healthy.active or + checker.checks.active.unhealthy.active)) -- mark every target for immediate delayed removal assert(checker:delayed_clear(0)) From ea63000225a61dd231d9bda2982ea52311ba8e39 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Wed, 24 Jun 2026 05:16:27 +0800 Subject: [PATCH 08/11] fix: elect one cleanup worker per window and resume probing promptly Two follow-ups from the PR review of the cleanup/lock-gating rework: 1. Stale-target cleanup ran on every worker each window, so N workers x M checkers all contended on each checker's TARGET_LIST_LOCK. Elect a single worker per window via an atomic shm:add on a dedicated cleanup key (TTL = CLEANUP_INTERVAL). Any worker can win -- including a passive-only one -- so the apache/apisix#13385 leak fix still holds, while the rest skip the pass instead of fighting for the lock. Removes the per-worker last_cleanup_check bookkeeping. 2. After a disable -> re-enable cycle, a backed-off timer (running at CHECK_INTERVAL * 10 while idle or after losing the lock race) took up to one slow tick to resume probing. checker:start() now wakes the timer back to CHECK_INTERVAL when it re-enables active checks. --- lib/resty/healthcheck.lua | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 5e3f3785..9891a04e 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -236,7 +236,6 @@ local hcs = setmetatable({}, { }) local active_check_timer -local last_cleanup_check -- serialize a table to a string local serialize = codec.encode @@ -1423,6 +1422,15 @@ function checker:start() worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE) + -- If the active-check timer is already running but backed off (idle worker, + -- or one that lost the periodic-lock race, runs at CHECK_INTERVAL * 10), + -- wake it up so a freshly (re-)enabled checker resumes probing promptly + -- instead of waiting up to one slow tick. + if active_check_timer and + (self.checks.active.healthy.active or self.checks.active.unhealthy.active) then + active_check_timer.interval = CHECK_INTERVAL + end + self:log(DEBUG, "active check flagged as active") return true end @@ -1726,7 +1734,7 @@ function _M.new(opts) self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ", "starting active check timer") local shm, key = self.shm, self.PERIODIC_LOCK - last_cleanup_check = ngx_now() + local cleanup_key = key .. ":cleanup" active_check_timer, err = resty_timer({ recurring = true, interval = CHECK_INTERVAL, @@ -1736,24 +1744,20 @@ function _M.new(opts) local cur_time = ngx_now() - -- Stale-target cleanup is decoupled from both active probing and the + -- Stale-target cleanup is decoupled from active probing and the -- periodic lock. A passive-only deployment (checks.passive but no -- active interval) has no active checker on any worker, yet still marks -- targets via delayed_clear() and must purge them; gating cleanup on -- the periodic lock (which only an active worker ever holds) would leak - -- those targets forever (apache/apisix#13385). Cleanup is therefore run - -- independently here: locking_target_list serializes the purge across - -- workers and re-reads the shm list each time, so running it on every - -- worker is safe and idempotent -- a worker that finds the list already - -- purged removes nothing and raises no duplicate remove event. + -- those targets forever (apache/apisix#13385). -- - -- `run_cleanup` is decided once, before the loop, and last_cleanup_check - -- advanced once after it: last_cleanup_check is module level and shared - -- by every checker, so advancing it inside the loop would let the first - -- checker starve the rest, leaving their purge-marked targets in the shm - -- forever. This only manifests with multiple health-checked upstreams. - local run_cleanup = (last_cleanup_check + CLEANUP_INTERVAL) < cur_time - if run_cleanup then + -- A single worker per window is elected via an atomic shm:add on + -- cleanup_key (TTL = CLEANUP_INTERVAL): any worker can win, including a + -- passive-only one, so the leak fix holds, while the other workers skip + -- the pass and do not all contend on each checker's TARGET_LIST_LOCK. + -- The elected worker purges every checker in one pass. + local won, add_err = shm:add(cleanup_key, ngx_worker_pid(), CLEANUP_INTERVAL) + if won then for _, checker_obj in pairs(hcs) do -- clear targets marked for delayed removal locking_target_list(checker_obj, function(target_list) @@ -1784,7 +1788,8 @@ function _M.new(opts) end end) end - last_cleanup_check = cur_time + elseif add_err ~= "exists" then + ngx_log(ERR, "failed to elect cleanup worker for '", cleanup_key, "': ", add_err) end -- The periodic lock distributes ACTIVE probing to a single worker. A From 8c718404dfbb35d050c571da75655e30e4fd47ac Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Wed, 24 Jun 2026 05:16:33 +0800 Subject: [PATCH 09/11] test: bound the cross-worker barrier wait in 99-status_ver The barrier handshake spun in an unbounded `while true` loop, so a worker that never synced would hang until the global test timeout with no useful diagnostic. Cap the wait with a deadline and fail loudly with the synced count when it is exceeded. --- t/with_resty-events/99-status_ver.t | 8 ++++++++ t/with_worker-events/99-status_ver.t | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/t/with_resty-events/99-status_ver.t b/t/with_resty-events/99-status_ver.t index dcb796f7..ba498525 100644 --- a/t/with_resty-events/99-status_ver.t +++ b/t/with_resty-events/99-status_ver.t @@ -59,6 +59,9 @@ our $HttpConfig = qq{ }) local my_id = ngx.worker.id() + local deadline = ngx.now() + 5 -- bound the barrier wait so a + -- missing worker fails loudly + -- instead of spinning forever while true do we.post("barrier", "ping", my_id) local seen = 0 @@ -70,6 +73,11 @@ our $HttpConfig = qq{ if seen >= worker_count then break end + ngx.update_time() + if ngx.now() >= deadline then + error("barrier not reached: " .. seen .. "/" .. + worker_count .. " workers synced before timeout") + end ngx.sleep(0.05) end diff --git a/t/with_worker-events/99-status_ver.t b/t/with_worker-events/99-status_ver.t index d2793efb..2dcec5a1 100644 --- a/t/with_worker-events/99-status_ver.t +++ b/t/with_worker-events/99-status_ver.t @@ -51,6 +51,9 @@ our $HttpConfig = qq{ }) local my_id = ngx.worker.id() + local deadline = ngx.now() + 5 -- bound the barrier wait so a + -- missing worker fails loudly + -- instead of spinning forever while true do we.post("barrier", "ping", my_id) we.poll() @@ -63,6 +66,11 @@ our $HttpConfig = qq{ if seen >= worker_count then break end + ngx.update_time() + if ngx.now() >= deadline then + error("barrier not reached: " .. seen .. "/" .. + worker_count .. " workers synced before timeout") + end ngx.sleep(0.05) end From 24ea30cce71f83ff443dbf9f7cca32c5d55e1edd Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Wed, 24 Jun 2026 08:44:02 +0800 Subject: [PATCH 10/11] fix: drop ineffective timer-wake on checker re-enable The interval-wake added to checker:start() could not bring a re-enabled checker back to fast probing: lua-resty-timer (~> 1) schedules each recurring sleep from self.sub_interval, which is frozen to the initial interval at construction and never re-synced from self.interval; self.interval only feeds scheduling for node-wide (key_name) timers, which the active-check timer is not. Mutating active_check_timer.interval from start() (or anywhere) therefore has no effect on the timer cadence, so the optimization was a no-op. Remove it. --- lib/resty/healthcheck.lua | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 9891a04e..4db29f1d 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1422,15 +1422,6 @@ function checker:start() worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE) - -- If the active-check timer is already running but backed off (idle worker, - -- or one that lost the periodic-lock race, runs at CHECK_INTERVAL * 10), - -- wake it up so a freshly (re-)enabled checker resumes probing promptly - -- instead of waiting up to one slow tick. - if active_check_timer and - (self.checks.active.healthy.active or self.checks.active.unhealthy.active) then - active_check_timer.interval = CHECK_INTERVAL - end - self:log(DEBUG, "active check flagged as active") return true end From 65326650a7d9e2a5da7db77c473b25a05b461e69 Mon Sep 17 00:00:00 2001 From: AlinsRan Date: Fri, 26 Jun 2026 14:42:14 +0800 Subject: [PATCH 11/11] refactor: drop no-op active_check_timer.interval back-off assignments lua-resty-timer schedules each recurring sleep from sub_interval (frozen at construction), never re-reading self.interval, so mutating active_check_timer.interval has no effect on the timer cadence -- the same reason the start() interval-wake was removed in 24ea30c. Remove the three remaining no-op assignments in the periodic tick so the code no longer implies an idle back-off that never happens. Also document that the per-window cleanup election purges only the elected worker's own hcs and is eventually consistent under asymmetric checker distribution. --- lib/resty/healthcheck.lua | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 4db29f1d..a60571ab 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1746,7 +1746,12 @@ function _M.new(opts) -- cleanup_key (TTL = CLEANUP_INTERVAL): any worker can win, including a -- passive-only one, so the leak fix holds, while the other workers skip -- the pass and do not all contend on each checker's TARGET_LIST_LOCK. - -- The elected worker purges every checker in one pass. + -- The elected worker purges every checker in its own `hcs` in one pass. + -- Purging writes to the shared shm target_list, so cleaning a checker on + -- any one worker is globally effective; if checkers are distributed + -- asymmetrically across workers, a given upstream is purged in whichever + -- window a worker that owns its checker wins the election -- eventually + -- consistent rather than every-worker-every-window. local won, add_err = shm:add(cleanup_key, ngx_worker_pid(), CLEANUP_INTERVAL) if won then for _, checker_obj in pairs(hcs) do @@ -1801,20 +1806,17 @@ function _M.new(opts) end if not has_active_checker then - -- release the lock only if we are still the holder, then back off - -- without contending again + -- release the lock only if we are still the holder, so a worker that + -- still owns active checkers can take over if shm:get(key) == ngx_worker_pid() then shm:delete(key) end - active_check_timer.interval = CHECK_INTERVAL * 10 return end if get_periodic_lock(shm, key) then - active_check_timer.interval = CHECK_INTERVAL renew_periodic_lock(shm, key) else - active_check_timer.interval = CHECK_INTERVAL * 10 return end