Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 74 additions & 13 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,6 @@ local hcs = setmetatable({}, {
})

local active_check_timer
local last_cleanup_check

-- serialize a table to a string
local serialize = codec.encode
Expand Down Expand Up @@ -1423,6 +1422,15 @@ function checker:start()
worker_events.unregister(self.ev_callback, self.EVENT_SOURCE) -- ensure we never double subscribe
worker_events.register_weak(self.ev_callback, self.EVENT_SOURCE)

-- If the active-check timer is already running but backed off (idle worker,
-- or one that lost the periodic-lock race, runs at CHECK_INTERVAL * 10),
-- wake it up so a freshly (re-)enabled checker resumes probing promptly
-- instead of waiting up to one slow tick.
if active_check_timer and
(self.checks.active.healthy.active or self.checks.active.unhealthy.active) then
active_check_timer.interval = CHECK_INTERVAL
end

Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
self:log(DEBUG, "active check flagged as active")
return true
end
Expand Down Expand Up @@ -1726,26 +1734,31 @@ function _M.new(opts)
self:log(DEBUG, "worker ", ngx_worker_id(), " (pid: ", ngx_worker_pid(), ") ",
"starting active check timer")
local shm, key = self.shm, self.PERIODIC_LOCK
last_cleanup_check = ngx_now()
local cleanup_key = key .. ":cleanup"
active_check_timer, err = resty_timer({
recurring = true,
interval = CHECK_INTERVAL,
jitter = CHECK_JITTER,
detached = false,
expire = function()

if get_periodic_lock(shm, key) then
active_check_timer.interval = CHECK_INTERVAL
renew_periodic_lock(shm, key)
else
active_check_timer.interval = CHECK_INTERVAL * 10
return
end

local cur_time = ngx_now()
for _, checker_obj in pairs(hcs) do

if (last_cleanup_check + CLEANUP_INTERVAL) < cur_time then
-- Stale-target cleanup is decoupled from active probing and the
-- periodic lock. A passive-only deployment (checks.passive but no
-- active interval) has no active checker on any worker, yet still marks
-- targets via delayed_clear() and must purge them; gating cleanup on
-- the periodic lock (which only an active worker ever holds) would leak
-- those targets forever (apache/apisix#13385).
--
-- A single worker per window is elected via an atomic shm:add on
-- cleanup_key (TTL = CLEANUP_INTERVAL): any worker can win, including a
-- passive-only one, so the leak fix holds, while the other workers skip
-- the pass and do not all contend on each checker's TARGET_LIST_LOCK.
-- The elected worker purges every checker in one pass.
local won, add_err = shm:add(cleanup_key, ngx_worker_pid(), CLEANUP_INTERVAL)
if won then
for _, checker_obj in pairs(hcs) do
-- clear targets marked for delayed removal
locking_target_list(checker_obj, function(target_list)
local removed_targets = {}
Expand Down Expand Up @@ -1774,10 +1787,48 @@ function _M.new(opts)
end
end
end)
end
elseif add_err ~= "exists" then
ngx_log(ERR, "failed to elect cleanup worker for '", cleanup_key, "': ", add_err)
end

last_cleanup_check = cur_time
-- The periodic lock distributes ACTIVE probing to a single worker. A
-- worker with no active checker must release the lock (if it holds it)
-- and back off, so a worker that still owns active checkers can take
-- over; otherwise active health checks stay stuck after a disable ->
-- re-enable cycle (apache/apisix#13235). This is gated on
-- has_active_checker rather than on `hcs` being non-empty, because a
-- disabled checker lingers in the weak `hcs` table with active=false
-- until it is garbage collected.
local has_active_checker = false
for _, checker_obj in pairs(hcs) do
if checker_obj.checks.active.healthy.active or
checker_obj.checks.active.unhealthy.active then
has_active_checker = true
break
end
end

if not has_active_checker then
-- release the lock only if we are still the holder, then back off
-- without contending again
if shm:get(key) == ngx_worker_pid() then
shm:delete(key)
end
active_check_timer.interval = CHECK_INTERVAL * 10
return
end

if get_periodic_lock(shm, key) then
active_check_timer.interval = CHECK_INTERVAL
renew_periodic_lock(shm, key)
else
active_check_timer.interval = CHECK_INTERVAL * 10
return
end

-- active probing: only the periodic-lock holder runs the probes
for _, checker_obj in pairs(hcs) do
if checker_obj.checks.active.healthy.active and
(checker_obj.checks.active.healthy.last_run +
checker_obj.checks.active.healthy.interval <= cur_time)
Expand Down Expand Up @@ -1868,4 +1919,14 @@ function _M.get_target_list(name, shm_name)
end


if TESTING then
-- test-only hook: shorten the stale-target cleanup window so the periodic
-- cleanup path can be exercised deterministically without waiting for the
-- default CLEANUP_INTERVAL (CHECK_INTERVAL * 25).
function _M._set_cleanup_interval(interval)
CLEANUP_INTERVAL = interval
end
end


return _M
42 changes: 30 additions & 12 deletions t/with_resty-events/14-tls_active_probes.t
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ plan tests => blocks() * 2;
my $pwd = cwd();
$ENV{TEST_NGINX_SERVROOT} = server_root();

# A local self-signed TLS server is used instead of an external host so that
# the active TLS probe tests are deterministic and do not depend on network
# access. The certificate has CN/SAN "example.test"; probing with that
# hostname (SNI) and certificate verification enabled succeeds, while probing
# with a mismatching hostname and verification enabled fails.
$ENV{TEST_NGINX_TLS_CERT} = "$pwd/t/with_resty-events/util/tls_probe_cert.pem";
$ENV{TEST_NGINX_TLS_KEY} = "$pwd/t/with_resty-events/util/tls_probe_key.pem";

our $HttpConfig = qq{
lua_package_path "$pwd/lib/?.lua;;";
lua_shared_dict test_shm 8m;
Expand All @@ -22,6 +30,16 @@ our $HttpConfig = qq{
}
}
}

server {
listen 2115 ssl;
ssl_certificate $ENV{TEST_NGINX_TLS_CERT};
ssl_certificate_key $ENV{TEST_NGINX_TLS_KEY};
server_name example.test;
location / {
return 200 'ok';
}
}
};

run_tests();
Expand All @@ -34,7 +52,7 @@ __DATA__
--- http_config eval: $::HttpConfig
--- config
location = /t {
lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
lua_ssl_verify_depth 2;
content_by_lua_block {
local we = require "resty.events.compat"
Expand All @@ -43,7 +61,7 @@ __DATA__
local checker = healthcheck.new({
name = "testing",
shm_name = "test_shm",
events_module = "resty.events",
events_module = "resty.events",
checks = {
active = {
type = "https",
Expand All @@ -59,9 +77,9 @@ events_module = "resty.events",
},
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
local ok, err = checker:add_target("127.0.0.1", 2115, "example.test", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true
ngx.say(checker:get_target_status("127.0.0.1", 2115, "example.test")) -- true
}
}
--- request
Expand All @@ -75,7 +93,7 @@ true
--- http_config eval: $::HttpConfig
--- config
location = /t {
lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
lua_ssl_verify_depth 2;
content_by_lua_block {
local we = require "resty.events.compat"
Expand All @@ -84,7 +102,7 @@ true
local checker = healthcheck.new({
name = "testing",
shm_name = "test_shm",
events_module = "resty.events",
events_module = "resty.events",
checks = {
active = {
type = "https",
Expand All @@ -100,9 +118,9 @@ events_module = "resty.events",
},
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", true)
ngx.sleep(8) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false
ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- false
}
}
--- request
Expand All @@ -116,7 +134,7 @@ false
--- http_config eval: $::HttpConfig
--- config
location = /t {
lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;
lua_ssl_trusted_certificate $TEST_NGINX_TLS_CERT;
lua_ssl_verify_depth 2;
content_by_lua_block {
local we = require "resty.events.compat"
Expand All @@ -125,7 +143,7 @@ false
local checker = healthcheck.new({
name = "testing",
shm_name = "test_shm",
events_module = "resty.events",
events_module = "resty.events",
checks = {
active = {
type = "https",
Expand All @@ -142,9 +160,9 @@ events_module = "resty.events",
},
}
})
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
local ok, err = checker:add_target("127.0.0.1", 2115, "wrong.host.test", false)
ngx.sleep(8) -- wait for 4x the check interval
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true
ngx.say(checker:get_target_status("127.0.0.1", 2115, "wrong.host.test")) -- true
}
}
--- request
Expand Down
96 changes: 96 additions & 0 deletions t/with_resty-events/19-multi-checker-cleanup.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use Test::Nginx::Socket::Lua;
use Cwd qw(cwd);

workers(1);

plan tests => repeat_each() * (blocks() * 2);

my $pwd = cwd();
$ENV{TEST_NGINX_SERVROOT} = server_root();

our $HttpConfig = qq{
lua_package_path "$pwd/lib/?.lua;;";
lua_shared_dict test_shm 8m;

init_worker_by_lua_block {
_G.__TESTING_HEALTHCHECKER = true
local we = require "resty.events.compat"
assert(we.configure({
unique_timeout = 5,
broker_id = 0,
listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
}))
assert(we.configured())
}

server {
server_name kong_worker_events;
listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
access_log off;
location / {
content_by_lua_block {
require("resty.events.compat").run()
}
}
}
};

run_tests();

__DATA__

=== TEST 1: stale-target cleanup runs for every checker, not just the first one
--- http_config eval: $::HttpConfig
--- config
location = /t {
content_by_lua_block {
local healthcheck = require("resty.healthcheck")
-- shorten the cleanup window so it fires within the test
healthcheck._set_cleanup_interval(0.2)

local function new_checker(name)
return assert(healthcheck.new({
name = name,
shm_name = "test_shm",
events_module = "resty.events",
checks = {
active = {
type = "tcp",
healthy = { interval = 0.1 },
unhealthy = { interval = 0.1 },
}
}
}))
end

-- two upstreams, each with health checks enabled
local checker1 = new_checker("upstream-1")
local checker2 = new_checker("upstream-2")

for i = 1, 3 do
assert(checker1:add_target("127.0.0.1", 20000 + i, nil, true))
assert(checker2:add_target("127.0.0.1", 30000 + i, nil, true))
end

-- mark every target on both checkers for immediate delayed removal
assert(checker1:delayed_clear(0))
assert(checker2:delayed_clear(0))

-- wait long enough for several cleanup windows to elapse
ngx.sleep(1.5)

local list1 = healthcheck.get_target_list("upstream-1", "test_shm")
local list2 = healthcheck.get_target_list("upstream-2", "test_shm")

-- Before the fix only the first checker in `hcs` is cleaned each
-- window, so checker2 keeps its purge-marked targets forever.
ngx.say("checker1 remaining: ", #list1)
ngx.say("checker2 remaining: ", #list2)
}
}
--- request
GET /t
--- response_body
checker1 remaining: 0
checker2 remaining: 0
--- timeout: 5
Loading
Loading