dragonflydb · kostasrim · Feb 18, 2025 · Jan 21, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/src/facade/dragonfly_connection.cc b/src/facade/dragonfly_connection.cc
@@ -9,6 +9,7 @@
 #include <absl/strings/str_cat.h>
 #include <mimalloc.h>
 
+#include <chrono>
 #include <numeric>
 #include <variant>
 
@@ -93,7 +94,6 @@ using nonstd::make_unexpected;
 
 namespace facade {
 
-
 namespace {
 
 void SendProtocolError(RedisParser::Result pres, SinkReplyBuilder* builder) {
@@ -266,8 +266,6 @@ void LogTraffic(uint32_t id, bool has_more, absl::Span<RespExpr> resp,
 
 constexpr size_t kMinReadSize = 256;
 
-thread_local uint32_t free_req_release_weight = 0;
-
 const char* kPhaseName[Connection::NUM_PHASES] = {"SETUP", "READ", "PROCESS", "SHUTTING_DOWN",
                                                   "PRECLOSE"};
 
@@ -316,6 +314,34 @@ QueueBackpressure& GetQueueBackpressure() {
 
 thread_local vector<Connection::PipelineMessagePtr> Connection::pipeline_req_pool_;
 
+// Tracks the minimum pipeline cache size seen per 10ms window, timed with a monotonic clock.
+class PipelineCacheSizeTracker {
+ public:
+  // Records pipeline_sz. Once the window has elapsed, returns true iff the minimum recorded
+  // in it was non-zero and starts a new window; returns false while the window is still open.
+  bool CheckAndUpdateWatermark(size_t pipeline_sz) {
+    const auto now = Clock::now();
+    min_ = std::min(min_, pipeline_sz);
+    if (now - last_check_ < std::chrono::milliseconds(10)) {
+      return false;
+    }
+
+    const bool watermark_reached = (min_ > 0);
+    min_ = Limits::max();
+    last_check_ = now;  // Reuse the sample taken above instead of reading the clock again.
+    return watermark_reached;
+  }
+
+ private:
+  // steady_clock is monotonic; system_clock may jump when the wall clock is adjusted.
+  using Clock = std::chrono::steady_clock;
+  using Limits = std::numeric_limits<size_t>;
+  Clock::time_point last_check_ = Clock::now();
+  size_t min_ = Limits::max();
+};
+
+thread_local PipelineCacheSizeTracker tl_pipe_cache_sz_tracker;
+
 void Connection::PipelineMessage::SetArgs(const RespVec& args) {
   auto* next = storage.data();
   for (size_t i = 0; i < args.size(); ++i) {
@@ -1589,14 +1615,7 @@ void Connection::ShrinkPipelinePool() {
   if (pipeline_req_pool_.empty())
     return;
 
-  // The request pool is shared by all the connections in the thread so we do not want
-  // to release it aggressively just because some connection is running in
-  // non-pipelined mode. So by using free_req_release_weight we wait at least N times,
-  // where N is the number of connections in the thread.
-  ++free_req_release_weight;
-
-  if (free_req_release_weight > stats_->num_conns) {
-    free_req_release_weight = 0;
+  if (tl_pipe_cache_sz_tracker.CheckAndUpdateWatermark(pipeline_req_pool_.size())) {
     stats_->pipeline_cmd_cache_bytes -= pipeline_req_pool_.back()->StorageCapacity();
     pipeline_req_pool_.pop_back();
   }
@@ -1606,7 +1625,6 @@ Connection::PipelineMessagePtr Connection::GetFromPipelinePool() {
   if (pipeline_req_pool_.empty())
     return nullptr;
 
-  free_req_release_weight = 0;  // Reset the release weight.
   auto ptr = std::move(pipeline_req_pool_.back());
   stats_->pipeline_cmd_cache_bytes -= ptr->StorageCapacity();
   pipeline_req_pool_.pop_back();

diff --git a/tests/dragonfly/connection_test.py b/tests/dragonfly/connection_test.py
@@ -1061,7 +1061,7 @@ async def test_timeout(df_server: DflyInstance, async_client: aioredis.Redis):
     assert len(clients) == 2
 
     await asyncio.sleep(2)
-    
+
     @assert_eventually
     async def wait_for_conn_drop():
         clients = await async_client.client_list()
@@ -1070,4 +1070,84 @@ async def wait_for_conn_drop():
 
     await wait_for_conn_drop()
     info = await async_client.info("clients")
-    assert int(info["timeout_disconnects"]) >= 1
+    assert int(info["timeout_disconnects"]) >= 1
+
+
+# Test that INFO replies produced inside a squashed pipeline report an empty pipeline cache,
+# while a standalone INFO issued afterwards reports a non-empty one.
+@dfly_args({"proactor_threads": 1, "pipeline_squash": 9})
+async def test_pipeline_cache_only_async_squashed_dispatches(df_factory):
+    server = df_factory.create()
+    server.start()
+
+    client = server.client()
+
+    async def push_pipeline(size=1):
+        p = client.pipeline(transaction=True)
+        for i in range(size):
+            p.info()
+        res = await p.execute()
+        return res
+
+    # Dispatch only async commands/pipelines and force squashing. pipeline_cache_bytes
+    # should be zero: the items that will be squashed are always dispatched, so by the time
+    # `INFO` runs the pipeline has consumed the cache throughout its execution.
+    for i in range(0, 30):
+        # 8 INFO + 2 from the MULTI/EXEC block injected by the client, i.e. 10 commands. The
+        # connection fiber yields to the dispatch/async fiber when ++async_streak_len_ >= 10;
+        # the minimum to squash is 9, so the pipeline gets squashed and INFO should report 0.
+        # Only the first reply is asserted (range(1)); the inner `i` shadows the outer one.
+        res = await push_pipeline(8)
+        for i in range(1):
+            assert res[i]["pipeline_cache_bytes"] == 0
+
+    # Non zero because we reclaimed/recycled the messages back to the cache
+    info = await client.info()
+    assert info["pipeline_cache_bytes"] > 0
+
+
+# Test that the pipeline cache size shrinks on workloads that storm the datastore with
+# pipeline commands and then "back off" by reducing the pipeline load such that the cache
+# becomes progressively underutilized. At that stage, the pipeline cache should slowly
+# shrink (because it's underutilized) and eventually drain to zero.
+@dfly_args({"proactor_threads": 1})
+async def test_pipeline_cache_size(df_factory):
+    server = df_factory.create(proactor_threads=1)
+    server.start()
+
+    # Two clients: one pushes pipelines, the other issues plain commands and reads INFO.
+    good_client = server.client()
+    bad_actor_client = server.client()
+
+    async def push_pipeline(bad_actor_client, size=1):
+        # Queue `size` LPUSH commands on a transactional pipeline and execute them.
+        p = bad_actor_client.pipeline(transaction=True)
+        for i in range(size):
+            p.lpush(str(i), "V")
+        await p.execute()
+
+    # Establish a baseline for the cache size. We dispatch async here.
+    await push_pipeline(bad_actor_client, 32)
+    info = await good_client.info()
+
+    old_pipeline_cache_bytes = info["pipeline_cache_bytes"]
+    assert old_pipeline_cache_bytes > 0
+    assert info["dispatch_queue_bytes"] == 0
+
+    for i in range(30):
+        await push_pipeline(bad_actor_client)
+        await good_client.execute_command(f"set foo{i} bar")
+
+    info = await good_client.info()
+
+    # After the lighter load the cache must be smaller than the baseline taken above.
+    assert old_pipeline_cache_bytes > info["pipeline_cache_bytes"]
+    assert info["dispatch_queue_bytes"] == 0
+
+    # Now drain the full cache; `i` keeps its last value (29) from the loop above.
+    async with async_timeout.timeout(5):
+        while info["pipeline_cache_bytes"] != 0:
+            await good_client.execute_command(f"set foo{i} bar")
+            info = await good_client.info()
+
+    assert info["dispatch_queue_bytes"] == 0