Skip to content

Commit 15342a3

Browse files
committed
connectd: don't complain if lightningd is unresponsive while doing dev-memleak.
We had a flake of the form:

```
2025-11-18T04:42:23.489Z **BROKEN** 022d223620a359a47ff7f7ac447c85c46c923da53389221a0054c11c1e3ca31d59-connectd: wake delay for WIRE_CHANNEL_REESTABLISH: 6789msec
```

This happened while we were shutting down. Some investigation revealed the cause: `dev-memleak` can be extremely slow, which is fair enough. So we change `dev-memleak` to call connectd first, and connectd uses that as a trigger to stop complaining about delays.

Signed-off-by: Rusty Russell <[email protected]>
1 parent a55c88b commit 15342a3

File tree

4 files changed

+45
-35
lines changed

4 files changed

+45
-35
lines changed

connectd/connectd.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2014,6 +2014,10 @@ static void dev_connect_memleak(struct daemon *daemon, const u8 *msg)
20142014
struct htable *memtable;
20152015
bool found_leak;
20162016

2017+
/* As a side-effect, this tells us lightningd will be unresponsive,
2018+
* so don't complain (and break CI!) if it's slow. */
2019+
daemon->dev_lightningd_is_slow = true;
2020+
20172021
memtable = memleak_start(tmpctx);
20182022
memleak_ptr(memtable, msg);
20192023

@@ -2465,6 +2469,7 @@ int main(int argc, char *argv[])
24652469
daemon->dev_suppress_gossip = false;
24662470
daemon->custom_msgs = NULL;
24672471
daemon->dev_exhausted_fds = false;
2472+
daemon->dev_lightningd_is_slow = false;
24682473
/* We generally allow 1MB per second per peer, except for dev testing */
24692474
daemon->gossip_stream_limit = 1000000;
24702475
daemon->scid_htable = new_htable(daemon, scid_htable);

connectd/connectd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,8 @@ struct daemon {
356356
bool dev_no_reconnect;
357357
/* --dev-fast-reconnect */
358358
bool dev_fast_reconnect;
359+
/* Don't complain about lightningd being unresponsive. */
360+
bool dev_lightningd_is_slow;
359361
};
360362

361363
/* Called by io_tor_connect once it has a connection out. */

connectd/multiplex.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1150,7 +1150,7 @@ static struct io_plan *write_to_subd(struct io_conn *subd_conn,
11501150
if (subd->peer->peer_in_lastmsg != -1) {
11511151
u64 msec = time_to_msec(timemono_between(time_mono(),
11521152
subd->peer->peer_in_lasttime));
1153-
if (msec > 5000)
1153+
if (msec > 5000 && !subd->peer->daemon->dev_lightningd_is_slow)
11541154
status_peer_broken(&subd->peer->id,
11551155
"wake delay for %s: %"PRIu64"msec",
11561156
peer_wire_name(subd->peer->peer_in_lastmsg),

lightningd/memdump.c

Lines changed: 37 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ static const struct json_command dev_memdump_command = {
8383
};
8484
AUTODATA(json_command, &dev_memdump_command);
8585

86+
8687
static void memleak_log(struct logger *log, const char *fmt, ...)
8788
{
8889
va_list ap;
@@ -91,16 +92,48 @@ static void memleak_log(struct logger *log, const char *fmt, ...)
9192
va_end(ap);
9293
}
9394

94-
static void finish_report(const struct leak_detect *leaks)
95+
static bool lightningd_check_leaks(struct command *cmd)
9596
{
97+
struct lightningd *ld = cmd->ld;
98+
struct htable *memtable;
99+
100+
/* Enter everything, except this cmd and its jcon */
101+
memtable = memleak_start(cmd);
102+
103+
/* This command is not a leak! */
104+
memleak_ptr(memtable, cmd);
105+
memleak_ignore_children(memtable, cmd);
106+
107+
/* Now delete ld and those which it has pointers to. */
108+
memleak_scan_obj(memtable, ld);
109+
110+
return dump_memleak(memtable, memleak_log, ld->log);
111+
}
112+
113+
static void finish_report(struct leak_detect *leaks)
114+
{
115+
bool found_leak;
96116
struct json_stream *response;
117+
const u8 *msg;
97118

98119
/* If it timed out, we free ourselves and exit! */
99120
if (!leaks->cmd) {
100121
tal_free(leaks);
101122
return;
102123
}
103124

125+
/* Check for our own leaks. */
126+
if (lightningd_check_leaks(leaks->cmd))
127+
tal_arr_expand(&leaks->leakers, "lightningd");
128+
129+
/* Check hsmd for leaks. */
130+
msg = hsm_sync_req(tmpctx, leaks->cmd->ld, take(towire_hsmd_dev_memleak(NULL)));
131+
if (!fromwire_hsmd_dev_memleak_reply(msg, &found_leak))
132+
fatal("Bad HSMD_DEV_MEMLEAK_REPLY: %s", tal_hex(tmpctx, msg));
133+
134+
if (found_leak)
135+
report_subd_memleak(leaks, leaks->cmd->ld->hsm);
136+
104137
response = json_stream_success(leaks->cmd);
105138
json_array_start(response, "leaks");
106139
for (size_t num_leakers = 0;
@@ -177,32 +210,12 @@ static void connect_dev_memleak_done(struct subd *connectd,
177210
report_subd_memleak(leaks, connectd);
178211
}
179212

180-
static bool lightningd_check_leaks(struct command *cmd)
181-
{
182-
struct lightningd *ld = cmd->ld;
183-
struct htable *memtable;
184-
185-
/* Enter everything, except this cmd and its jcon */
186-
memtable = memleak_start(cmd);
187-
188-
/* This command is not a leak! */
189-
memleak_ptr(memtable, cmd);
190-
memleak_ignore_children(memtable, cmd);
191-
192-
/* Now delete ld and those which it has pointers to. */
193-
memleak_scan_obj(memtable, ld);
194-
195-
return dump_memleak(memtable, memleak_log, ld->log);
196-
}
197-
198213
static struct command_result *json_memleak(struct command *cmd,
199214
const char *buffer,
200215
const jsmntok_t *obj UNNEEDED,
201216
const jsmntok_t *params)
202217
{
203218
struct lightningd *ld = cmd->ld;
204-
const u8 *msg;
205-
bool found_leak;
206219
struct leak_detect *leaks;
207220

208221
if (!param_check(cmd, buffer, params, NULL))
@@ -221,19 +234,9 @@ static struct command_result *json_memleak(struct command *cmd,
221234
leaks->num_outstanding_requests = 0;
222235
leaks->leakers = tal_arr(leaks, const char *, 0);
223236

224-
/* Check for our own leaks. */
225-
if (lightningd_check_leaks(cmd))
226-
tal_arr_expand(&leaks->leakers, "lightningd");
227-
228-
/* hsmd is sync, so do that first. */
229-
msg = hsm_sync_req(tmpctx, cmd->ld, take(towire_hsmd_dev_memleak(NULL)));
230-
if (!fromwire_hsmd_dev_memleak_reply(msg, &found_leak))
231-
fatal("Bad HSMD_DEV_MEMLEAK_REPLY: %s", tal_hex(tmpctx, msg));
232-
233-
if (found_leak)
234-
report_subd_memleak(leaks, ld->hsm);
235-
236-
/* Now do all the async ones. */
237+
/* Now do all the async ones. By doing connectd first, it
238+
* has the side-effect of suppressing the complaint it makes
239+
* about us being unresponsive. */
237240
start_leak_request(subd_req(ld->connectd, ld->connectd,
238241
take(towire_connectd_dev_memleak(NULL)),
239242
-1, 0, connect_dev_memleak_done, leaks),

0 commit comments

Comments
 (0)