From 79311f3a3149791a37e900cc0cc89c222caa564e Mon Sep 17 00:00:00 2001 From: Wesley Pettit Date: Thu, 30 May 2024 21:54:21 -0700 Subject: [PATCH 1/3] engine: input grace period delay shutdown for pending tasks & chunks 1. Input grace period Currently, Fluent Bit pauses all inputs 1 second after SIGTERM. This change creates an input grace period, which by default is one half of the total Grace setting. This means that halfway through the grace period Fluent Bit stops accepting any new logs and only sends logs pending in the buffers. 2. Check pending chunks on shutdown Previously, the engine shut down immediately if there were no pending tasks. A task is created from a chunk in the buffer. If there is a new chunk, but no task yet, the engine should keep running until the task is created and completed. This change makes the engine wait on shutdown for all pending chunks until the max grace period has expired. Signed-off-by: Wesley Pettit Co-authored-by: Anuj Singh --- include/fluent-bit/flb_config.h | 7 +++--- include/fluent-bit/flb_engine.h | 1 + include/fluent-bit/flb_storage.h | 2 ++ src/flb_config.c | 1 + src/flb_engine.c | 42 +++++++++++++++++++++++++++++--- src/flb_storage.c | 10 ++++++++ 6 files changed, 56 insertions(+), 7 deletions(-) diff --git a/include/fluent-bit/flb_config.h b/include/fluent-bit/flb_config.h index 1cfc6301ff8..c02607f35f3 100644 --- a/include/fluent-bit/flb_config.h +++ b/include/fluent-bit/flb_config.h @@ -53,9 +53,10 @@ struct flb_config { * shutdown when all remaining tasks are flushed */ int grace; - int grace_count; /* Count of grace shutdown tries */ - flb_pipefd_t flush_fd; /* Timer FD associated to flush */ - int convert_nan_to_null; /* convert null to nan ? */ + int grace_count; /* Count of grace shutdown tries */ + int grace_input; /* Shutdown grace to keep inputs ingesting */ + flb_pipefd_t flush_fd; /* Timer FD associated to flush */ + int convert_nan_to_null; /* Convert null to nan ? */ int daemon; /* Run as a daemon ? 
*/ flb_pipefd_t shutdown_fd; /* Shutdown FD, 5 seconds */ diff --git a/include/fluent-bit/flb_engine.h b/include/fluent-bit/flb_engine.h index d242cc61a43..b10bed4202c 100644 --- a/include/fluent-bit/flb_engine.h +++ b/include/fluent-bit/flb_engine.h @@ -37,6 +37,7 @@ int flb_engine_exit_status(struct flb_config *config, int status); int flb_engine_shutdown(struct flb_config *config); int flb_engine_destroy_tasks(struct mk_list *tasks); void flb_engine_reschedule_retries(struct flb_config *config); +void flb_engine_stop_ingestion(struct flb_config *config); /* Engine event loop */ void flb_engine_evl_init(); diff --git a/include/fluent-bit/flb_storage.h b/include/fluent-bit/flb_storage.h index 220568ae41d..c5b24aaae97 100644 --- a/include/fluent-bit/flb_storage.h +++ b/include/fluent-bit/flb_storage.h @@ -83,4 +83,6 @@ struct flb_storage_metrics *flb_storage_metrics_create(struct flb_config *ctx); /* cmetrics */ int flb_storage_metrics_update(struct flb_config *config, struct flb_storage_metrics *sm); +void flb_chunk_count(struct flb_config *ctx, int *mem_chunks, int *fs_chunks); + #endif diff --git a/src/flb_config.c b/src/flb_config.c index 32dc34b7e83..b55e3222a2e 100644 --- a/src/flb_config.c +++ b/src/flb_config.c @@ -241,6 +241,7 @@ struct flb_config *flb_config_init() config->verbose = 3; config->grace = 5; config->grace_count = 0; + config->grace_input = config->grace / 2; config->exit_status_code = 0; /* json */ diff --git a/src/flb_engine.c b/src/flb_engine.c index bedc28477c5..d684577dcfb 100644 --- a/src/flb_engine.c +++ b/src/flb_engine.c @@ -685,6 +685,9 @@ int sb_segregate_chunks(struct flb_config *config) int flb_engine_start(struct flb_config *config) { int ret; + int tasks = 0; + int fs_chunks = 0; + int mem_chunks = 0; uint64_t ts; char tmp[16]; int rb_flush_flag; @@ -951,6 +954,9 @@ int flb_engine_start(struct flb_config *config) return -2; } + config->grace_input = config->grace / 2; + flb_info("[engine] Shutdown Grace Period=%d, Shutdown Input 
Grace Period=%d", config->grace, config->grace_input); + while (1) { rb_flush_flag = FLB_FALSE; @@ -1019,19 +1025,36 @@ int flb_engine_start(struct flb_config *config) * If grace period is set to -1, keep trying to shut down until all * tasks and retries get flushed. */ - ret = flb_task_running_count(config); + tasks = 0; + mem_chunks = 0; + fs_chunks = 0; + tasks = flb_task_running_count(config); + flb_chunk_count(config, &mem_chunks, &fs_chunks); + ret = tasks + mem_chunks + fs_chunks; if (ret > 0 && (config->grace_count < config->grace || config->grace == -1)) { if (config->grace_count == 1) { flb_task_running_print(config); } - flb_engine_exit(config); + if ((mem_chunks + fs_chunks) > 0) { + flb_info("[engine] Pending chunk count: memory=%d, filesystem=%d", + mem_chunks, fs_chunks); + } + if (config->grace_count < config->grace_input) { + flb_engine_exit(config); + } else { + flb_engine_stop_ingestion(config); + } } else { - if (ret > 0) { + if (tasks > 0) { flb_task_running_print(config); } + if ((mem_chunks + fs_chunks) > 0) { + flb_info("[engine] Pending chunk count: memory=%d, filesystem=%d", + mem_chunks, fs_chunks); + } flb_info("[engine] service has stopped (%i pending tasks)", - ret); + tasks); ret = config->exit_status_code; flb_engine_shutdown(config); config = NULL; @@ -1132,6 +1155,7 @@ int flb_engine_shutdown(struct flb_config *config) struct flb_sched_timer_coro_cb_params *sched_params; config->is_running = FLB_FALSE; + config->is_ingestion_active = FLB_FALSE; flb_input_pause_all(config); #ifdef FLB_HAVE_STREAM_PROCESSOR @@ -1200,6 +1224,16 @@ int flb_engine_exit(struct flb_config *config) return ret; } +/* Stop ingestion and pause all inputs */ +void flb_engine_stop_ingestion(struct flb_config *config) +{ + config->is_ingestion_active = FLB_FALSE; + config->is_shutting_down = FLB_TRUE; + + flb_info("[engine] pausing all inputs.."); + flb_input_pause_all(config); +} + int flb_engine_exit_status(struct flb_config *config, int status) { 
config->exit_status_code = status; diff --git a/src/flb_storage.c b/src/flb_storage.c index 4148be66781..deac97a526e 100644 --- a/src/flb_storage.c +++ b/src/flb_storage.c @@ -710,6 +710,16 @@ int flb_storage_create(struct flb_config *ctx) return 0; } +void flb_chunk_count(struct flb_config *ctx, int *mem_chunks, int *fs_chunks) +{ + struct cio_stats storage_st; + + cio_stats_get(ctx->cio, &storage_st); + + *mem_chunks = storage_st.chunks_mem; + *fs_chunks = storage_st.chunks_fs; +} + void flb_storage_destroy(struct flb_config *ctx) { struct cio_ctx *cio; From 3fcb50e28552e61a8a86967728c23751497d8efe Mon Sep 17 00:00:00 2001 From: Wesley Pettit Date: Wed, 12 Jun 2024 21:01:09 -0700 Subject: [PATCH 2/3] engine: force flush on shutdown to create tasks for pending chunks Signed-off-by: Wesley Pettit --- src/flb_engine.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/flb_engine.c b/src/flb_engine.c index d684577dcfb..cb12d186550 100644 --- a/src/flb_engine.c +++ b/src/flb_engine.c @@ -698,6 +698,7 @@ int flb_engine_start(struct flb_config *config) struct flb_sched *sched; struct flb_net_dns dns_ctx; struct flb_notification *notification; + int exiting = FLB_FALSE; /* Initialize the networking layer */ flb_net_lib_init(); @@ -1036,13 +1037,21 @@ int flb_engine_start(struct flb_config *config) flb_task_running_print(config); } if ((mem_chunks + fs_chunks) > 0) { - flb_info("[engine] Pending chunk count: memory=%d, filesystem=%d", - mem_chunks, fs_chunks); + flb_info("[engine] pending chunk count: memory=%d, filesystem=%d; grace_timer=%d", + mem_chunks, fs_chunks, config->grace_count); } + + /* Create new tasks for pending chunks */ + flb_engine_flush(config, NULL); if (config->grace_count < config->grace_input) { - flb_engine_exit(config); + if (exiting == FLB_FALSE) { + flb_engine_exit(config); + exiting = FLB_TRUE; + } } else { - flb_engine_stop_ingestion(config); + if (config->is_ingestion_active == FLB_TRUE) { + 
flb_engine_stop_ingestion(config); + } } } else { @@ -1050,8 +1059,8 @@ int flb_engine_start(struct flb_config *config) flb_task_running_print(config); } if ((mem_chunks + fs_chunks) > 0) { - flb_info("[engine] Pending chunk count: memory=%d, filesystem=%d", - mem_chunks, fs_chunks); + flb_info("[engine] pending chunk count: memory=%d, filesystem=%d; grace_timer=%d", + mem_chunks, fs_chunks, config->grace_count); } flb_info("[engine] service has stopped (%i pending tasks)", tasks); From fb13aebc07e632e7f3831c8ff966b30a311f8d3d Mon Sep 17 00:00:00 2001 From: Wesley Pettit Date: Sun, 16 Jun 2024 19:03:31 -0700 Subject: [PATCH 3/3] engine: send backlog chunks on shutdown Signed-off-by: Wesley Pettit --- src/flb_engine.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/flb_engine.c b/src/flb_engine.c index cb12d186550..e19cfc6c6dd 100644 --- a/src/flb_engine.c +++ b/src/flb_engine.c @@ -950,7 +950,8 @@ int flb_engine_start(struct flb_config *config) ret = sb_segregate_chunks(config); - if (ret) { + if (ret < 0) + { flb_error("[engine] could not segregate backlog chunks"); return -2; } @@ -1035,6 +1036,11 @@ int flb_engine_start(struct flb_config *config) if (ret > 0 && (config->grace_count < config->grace || config->grace == -1)) { if (config->grace_count == 1) { flb_task_running_print(config); + ret = sb_segregate_chunks(config); + if (ret < 0) { + flb_error("[engine] could not segregate backlog chunks"); + return -2; + } } if ((mem_chunks + fs_chunks) > 0) { flb_info("[engine] pending chunk count: memory=%d, filesystem=%d; grace_timer=%d",