From 9caaa798e3902e1350c8c09649699e3e950cc545 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 18 Apr 2025 22:56:30 +0000
Subject: [PATCH 01/14] feat(TNS): initial single thread disk-based sorting

---
 dbcon/joblist/limitedorderby.h   |   1 +
 dbcon/joblist/tupleannexstep.cpp | 178 +++++++++++++++++--------------
 dbcon/joblist/tupleannexstep.h   |  12 ++-
 3 files changed, 111 insertions(+), 80 deletions(-)

diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index ff0daafd22..8f5717bea7 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -51,6 +51,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   const std::string toString() const override;
 
   void finalize();
+  void flushCurrentToDisk() {}
 
  protected:
   uint64_t fStart;
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 3345194181..806501a7bc 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -46,7 +46,6 @@ using namespace execplan;
 using namespace rowgroup;
 
 #include "hasher.h"
-#include "stlpoolallocator.h"
 #include "threadnaming.h"
 using namespace utils;
 
@@ -83,8 +82,8 @@ struct TAEq
   bool operator()(const rowgroup::Row::Pointer&, const rowgroup::Row::Pointer&) const;
 };
 // TODO:  Generalize these and put them back in utils/common/hasher.h
-using TNSDistinctMap_t =
-    std::unordered_set<rowgroup::Row::Pointer, TAHasher, TAEq, STLPoolAllocator<rowgroup::Row::Pointer> >;
+using TNSDistinctMap_t = std::unordered_set<rowgroup::Row::Pointer, TAHasher, TAEq,
+                                            allocators::CountingAllocator<rowgroup::Row::Pointer> >;
 };  // namespace
 
 inline uint64_t TAHasher::operator()(const Row::Pointer& p) const
@@ -574,12 +573,12 @@ void TupleAnnexStep::executeNoOrderByWithDistinct()
       dataVec.pop_back();
     }
   }
-  catch (const std::bad_alloc&)
+  catch (const logging::OutOfMemoryExcept&)
   {
     auto errorCode = ERR_TNS_DISTINCT_IS_TOO_BIG;
     auto newException = IDBExcept(errorCode);
-    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::executeNoOrderByWithDistinct()");
+    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+                    logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeNoOrderByWithDistinct()");
   }
   catch (...)
   {
@@ -612,88 +611,107 @@ void TupleAnnexStep::executeWithOrderBy()
   RGData rgDataOut;
   bool more = false;
 
-  try
+  for(;;)
   {
-    more = fInputDL->next(fInputIterator, &rgDataIn);
-
-    if (traceOn())
-      dlTimes.setFirstReadTime();
-
-    StepTeleStats sts(fQueryUuid, fStepUuid, StepTeleStats::ST_START, 1);
-    postStepStartTele(sts);
-
-    while (more && !cancelled())
+    try
     {
-      fRowGroupIn.setData(&rgDataIn);
-      fRowGroupIn.getRow(0, &fRowIn);
-
-      for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
-      {
-        fOrderBy->processRow(fRowIn);
-        fRowIn.nextRow();
-      }
-
       more = fInputDL->next(fInputIterator, &rgDataIn);
-    }
 
-    fOrderBy->finalize();
+      if (traceOn())
+        dlTimes.setFirstReadTime();
 
-    if (!cancelled())
-    {
-      while (fOrderBy->getData(rgDataIn))
+      StepTeleStats sts(fQueryUuid, fStepUuid, StepTeleStats::ST_START, 1);
+      postStepStartTele(sts);
+
+      while (more && !cancelled())
       {
-        if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
+        fRowGroupIn.setData(&rgDataIn);
+        fRowGroupIn.getRow(0, &fRowIn);
+
+        for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
         {
-          rgDataOut = rgDataIn;
-          fRowGroupOut.setData(&rgDataOut);
+          fOrderBy->processRow(fRowIn);
+          fRowIn.nextRow();
         }
-        else
-        {
-          fRowGroupIn.setData(&rgDataIn);
-          fRowGroupIn.getRow(0, &fRowIn);
 
-          rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
-          fRowGroupOut.setData(&rgDataOut);
-          fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
-          fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
-          fRowGroupOut.getRow(0, &fRowOut);
+        more = fInputDL->next(fInputIterator, &rgDataIn);
+      }
 
-          for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
+      if (!isDiskBased())
+      {
+        fOrderBy->finalize();
+
+        if (!cancelled())
+        {
+          while (fOrderBy->getData(rgDataIn))
           {
-            if (fConstant)
-              fConstant->fillInConstants(fRowIn, fRowOut);
+            if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
+            {
+              rgDataOut = rgDataIn;
+              fRowGroupOut.setData(&rgDataOut);
+            }
             else
-              copyRow(fRowIn, &fRowOut);
-
-            fRowGroupOut.incRowCount();
-            fRowOut.nextRow();
-            fRowIn.nextRow();
-          }
-        }
+            {
+              fRowGroupIn.setData(&rgDataIn);
+              fRowGroupIn.getRow(0, &fRowIn);
+
+              rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
+              fRowGroupOut.setData(&rgDataOut);
+              fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
+              fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
+              fRowGroupOut.getRow(0, &fRowOut);
+
+              for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
+              {
+                if (fConstant)
+                  fConstant->fillInConstants(fRowIn, fRowOut);
+                else
+                  copyRow(fRowIn, &fRowOut);
+
+                fRowGroupOut.incRowCount();
+                fRowOut.nextRow();
+                fRowIn.nextRow();
+              }
+            }
 
-        if (fRowGroupOut.getRowCount() > 0)
-        {
-          fRowsReturned += fRowGroupOut.getRowCount();
-          fOutputDL->insert(rgDataOut);
+            if (fRowGroupOut.getRowCount() > 0)
+            {
+              fRowsReturned += fRowGroupOut.getRowCount();
+              fOutputDL->insert(rgDataOut);
 
-          // release RGData memory
-          size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
-          fOrderBy->returnRGDataMemory2RM(rgDataSize);
+              // release RGData memory
+              size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
+              fOrderBy->returnRGDataMemory2RM(rgDataSize);
+            }
+          }
         }
       }
     }
-  }
-  catch (const std::bad_alloc&)
-  {
-    auto errorCode = fOrderBy->getErrorCode();
-    auto newException = IDBExcept(errorCode);
-    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::executeWithOrderBy()");
-  }
-  catch (...)
-  {
-    handleException(std::current_exception(), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::executeWithOrderBy()");
+    catch (const logging::OutOfMemoryExcept&)
+    {
+      // if (!isDiskBased()) // if enabled
+      // {
+      convertToDiskBased();
+      // continue;
+      // }
+      // else
+      // {
+      //   auto errorCode = fOrderBy->getErrorCode();
+      //   auto newException = OutOfMemoryExcept(errorCode);
+      //   handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+      //                   logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeWithOrderBy()");
+      // }
+    }
+    catch (...)
+    {
+      handleException(std::current_exception(), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
+                      "TupleAnnexStep::executeWithOrderBy()");
+    }
+
+    if (isDiskBased())
+    {
+      fOrderBy->flushCurrentToDisk();
+    }
   }
 
   while (more)
@@ -729,9 +747,11 @@ void TupleAnnexStep::finalizeParallelOrderByDistinct()
   // Calculate offset here
   fRowGroupOut.getRow(0, &fRowOut);
 
-  ordering::SortingPQ finalPQ(rowgroup::rgCommonSize, fRm->getAllocator<ordering::OrderByRow>());
+  auto allocSorting = fRm->getAllocator<ordering::OrderByRow>();
+  ordering::SortingPQ finalPQ(rowgroup::rgCommonSize, allocSorting);
+  auto allocDistinct = fRm->getAllocator<rowgroup::Row::Pointer>();
   std::unique_ptr<TNSDistinctMap_t> distinctMap(
-      new TNSDistinctMap_t(10, TAHasher(this), TAEq(this), STLPoolAllocator<rowgroup::Row::Pointer>(fRm)));
+      new TNSDistinctMap_t(10, TAHasher(this), TAEq(this), allocDistinct));
   fRowGroupIn.initRow(&row1);
   fRowGroupIn.initRow(&row2);
 
@@ -765,8 +785,8 @@ void TupleAnnexStep::finalizeParallelOrderByDistinct()
   {
     auto errorCode = fOrderBy->getErrorCode();
     auto newException = IDBExcept(errorCode);
-    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::finalizeParallelOrderByDistinct()");
+    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+                    logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::finalizeParallelOrderByDistinct()");
   }
   catch (...)
   {
@@ -964,8 +984,8 @@ void TupleAnnexStep::finalizeParallelOrderBy()
   {
     auto errorCode = fOrderBy->getErrorCode();
     auto newException = IDBExcept(errorCode);
-    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::finalizeParallelOrderBy()");
+    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+                    logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::finalizeParallelOrderBy()");
   }
   catch (...)
   {
@@ -1179,8 +1199,8 @@ void TupleAnnexStep::executeParallelOrderBy(uint64_t id)
   {
     auto errorCode = fOrderBy->getErrorCode();
     auto newException = IDBExcept(errorCode);
-    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                    "TupleAnnexStep::executeParallelOrderBy()");
+    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+                    logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeParallelOrderBy()");
   }
   catch (...)
   {
diff --git a/dbcon/joblist/tupleannexstep.h b/dbcon/joblist/tupleannexstep.h
index f259476861..2c0beae174 100644
--- a/dbcon/joblist/tupleannexstep.h
+++ b/dbcon/joblist/tupleannexstep.h
@@ -20,8 +20,8 @@
 
 #pragma once
 
-#include <queue>
 #include <boost/thread/thread.hpp>
+#include <atomic>
 
 #include "jobstep.h"
 #include "limitedorderby.h"
@@ -114,6 +114,15 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   void printCalTrace();
   void finalizeParallelOrderBy();
   void finalizeParallelOrderByDistinct();
+  void convertToDiskBased()
+  {
+    fDiskBased.store(true, std::memory_order_relaxed);
+  }
+  bool isDiskBased() const
+  {
+    return fDiskBased.load(std::memory_order_relaxed);
+  }
+
 
   // input/output rowgroup and row
   rowgroup::RowGroup fRowGroupIn;
@@ -173,6 +182,7 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   uint16_t fFinishedThreads;
   boost::mutex fParallelFinalizeMutex;
   joblist::ResourceManager* fRm;
+  std::atomic<bool> fDiskBased;
 };
 
 }  // namespace joblist

From 25d3387e5e864c40453a0ba49575e070100c6372 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Mon, 21 Apr 2025 17:44:54 +0000
Subject: [PATCH 02/14] feat(sorting,TNS): preparations to replace heavy
 LimitedOrderBy::finalize with a lightweight finalize + heavy getNextRGData

---
 dbcon/joblist/limitedorderby.cpp | 145 ++++++++++++++++++++---
 dbcon/joblist/limitedorderby.h   |   3 +
 dbcon/joblist/tupleannexstep.cpp | 196 ++++++++++++++++++-------------
 dbcon/joblist/tupleannexstep.h   |  24 +++-
 4 files changed, 263 insertions(+), 105 deletions(-)

diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index f22b2adab3..7b313de3ce 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -109,13 +109,13 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   if (fCount == 0)
     return;
 
-  auto& orderByQueue = getQueue();
+  auto& orderedRowsQueue = getQueue();
   // if the row count is less than the limit
-  if (orderByQueue.size() < fStart + fCount)
+  if (orderedRowsQueue.size() < fStart + fCount)
   {
     copyRow(row, &fRow0);
     OrderByRow newRow(fRow0, fRule);
-    orderByQueue.push(newRow);
+    orderedRowsQueue.push(newRow);
 
     uint64_t memSizeInc = sizeof(newRow);
     fUncommitedMemory += memSizeInc;
@@ -156,23 +156,38 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
     }
   }
 
-  else if (fOrderByCond.size() > 0 && fRule.less(row.getPointer(), orderByQueue.top().fData))
+  else if (fOrderByCond.size() > 0 && fRule.less(row.getPointer(), orderedRowsQueue.top().fData))
   {
-    OrderByRow swapRow = orderByQueue.top();
+    OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
     copyRow(row, &row1);
 
     if (fDistinct)
     {
-      fDistinctMap->erase(orderByQueue.top().fData);
+      fDistinctMap->erase(orderedRowsQueue.top().fData);
       fDistinctMap->insert(row1.getPointer());
     }
 
-    orderByQueue.pop();
-    orderByQueue.push(swapRow);
+    orderedRowsQueue.pop();
+    orderedRowsQueue.push(swapRow);
   }
 }
 
+  void LimitedOrderBy::brandNewFinalize()
+  {
+    auto& orderedRowsQueue = getQueue();
+    uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
+
+    uint64_t effectiveCount = std::min(queueSizeWoOffset, fCount);
+    uint64_t fOffsetInOrderedRowsQueue = effectiveCount % fRowsPerRG;
+    if (fOffsetInOrderedRowsQueue == 0 && effectiveCount > 0)
+    {
+      fOffsetInOrderedRowsQueue = fRowsPerRG;
+    }
+
+  }
+
+
 /*
  * The f() copies top element from an ordered queue into a row group. It
  * does this backwards to syncronise sorting orientation with the server.
@@ -185,7 +200,7 @@ void LimitedOrderBy::finalize()
     if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
     {
       cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-      throw IDBExcept(fErrorCode);
+      throw logging::OutOfMemoryExcept(fErrorCode);
     }
     fMemSize += fUncommitedMemory;
     fUncommitedMemory = 0;
@@ -195,9 +210,9 @@ void LimitedOrderBy::finalize()
   if (fRowGroup.getRowCount() > 0)
     fDataQueue.push(fData);
 
-  auto& orderByQueue = getQueue();
+  auto& orderedRowsQueue = getQueue();
 
-  if (orderByQueue.size() > 0)
+  if (orderedRowsQueue.size() > 0)
   {
     // *DRRTUY Very memory intensive. CS needs to account active
     // memory only and release memory if needed.
@@ -206,14 +221,14 @@ void LimitedOrderBy::finalize()
     if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
     {
       cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-      throw IDBExcept(fErrorCode);
+      throw logging::OutOfMemoryExcept(fErrorCode);
     }
     fMemSize += memSizeInc;
 
     uint64_t offset = 0;
     uint64_t i = 0;
     // Reduce queue size by an offset value if it applicable.
-    uint64_t queueSizeWoOffset = orderByQueue.size() > fStart ? orderByQueue.size() - fStart : 0;
+    uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
     list<RGData> tempRGDataList;
 
     if (fCount <= queueSizeWoOffset)
@@ -242,15 +257,15 @@ void LimitedOrderBy::finalize()
     offset = offset != 0 ? offset - 1 : offset;
     fRowGroup.getRow(offset, &fRow0);
 
-    while ((orderByQueue.size() > fStart) && (i++ < fCount))
+    while ((orderedRowsQueue.size() > fStart) && (i++ < fCount))
     {
-      const OrderByRow& topRow = orderByQueue.top();
+      const OrderByRow& topRow = orderedRowsQueue.top();
       row1.setData(topRow.fData);
       copyRow(row1, &fRow0);
       fRowGroup.incRowCount();
       offset--;
       fRow0.prevRow(rSize);
-      orderByQueue.pop();
+      orderedRowsQueue.pop();
 
       // if RG has fRowsPerRG rows
       if (offset == (uint64_t)-1)
@@ -260,7 +275,7 @@ void LimitedOrderBy::finalize()
         if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
         {
           cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-          throw IDBExcept(fErrorCode);
+          throw logging::OutOfMemoryExcept(fErrorCode);
         }
         fMemSize += memSizeInc;
 
@@ -282,6 +297,102 @@ void LimitedOrderBy::finalize()
   }
 }
 
+
+
+bool LimitedOrderBy::getNextRGData(RGData& data)
+{
+  auto& orderedRowsQueue = getQueue();
+
+  if (orderedRowsQueue.empty())
+  {
+    return false;
+  }
+
+  // *DRRTUY Very memory intensive. CS needs to account active
+  // memory only and release memory if needed.
+  // uint64_t memSizeInc = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
+  uint64_t memSizeInc = 0ULL;
+
+  // if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
+  // {
+  //   cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+  //   throw logging::OutOfMemoryExcept(fErrorCode);
+  // }
+  // fMemSize += memSizeInc;
+
+  uint64_t offset = 0;
+  uint64_t i = 0;
+  // Reduce queue size by an offset value if it applicable.
+  uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
+  // list<RGData> tempRGDataList;
+
+  // ordered rows queue offset calculation based on LIMIT/OFFSET of a query
+  // looks like count = min(fCount, queueSizeWoOffset)
+  if (fCount <= queueSizeWoOffset)
+  {
+    offset = fCount % fRowsPerRG;
+    // If fCount is a multiply of fRowsPerRG
+    if (!offset && fCount > 0)
+      offset = fRowsPerRG;
+  }
+  else
+  {
+    offset = queueSizeWoOffset % fRowsPerRG;
+    // If fCount is a multiply of fRowsPerRG
+    if (!offset && queueSizeWoOffset > 0)
+      offset = fRowsPerRG;
+  }
+
+  // list<RGData>::iterator tempListIter = tempRGDataList.begin();
+
+  i = 0;
+  uint32_t rSize = fRow0.getSize();
+  uint64_t preLastRowNumb = fRowsPerRG - 1;
+
+  data.reinit(fRowGroup, fRowsPerRG);
+  fRowGroup.setData(&data);
+  fRowGroup.resetRowGroup(0);
+  // *DRRTUY This approach won't work with
+  // OFFSET > fRowsPerRG
+  offset = offset != 0 ? offset - 1 : offset;
+  fRowGroup.getRow(offset, &fRow0);
+
+  while ((orderedRowsQueue.size() > fStart) && (i++ < fCount))
+  {
+    const OrderByRow& topRow = orderedRowsQueue.top();
+    row1.setData(topRow.data);
+    copyRow(row1, &fRow0);
+    fRowGroup.incRowCount();
+    offset--;
+    fRow0.prevRow(rSize);
+    orderedRowsQueue.pop();
+
+    // if RG has fRowsPerRG rows
+    if (offset == (uint64_t)-1)
+    {
+      tempRGDataList.push_front(data);
+
+      if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
+      {
+        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+        throw logging::OutOfMemoryExcept(fErrorCode);
+      }
+      fMemSize += memSizeInc;
+
+      data.reinit(fRowGroup, fRowsPerRG);
+      fRowGroup.setData(&data);
+      fRowGroup.resetRowGroup(0);  // ?
+      fRowGroup.getRow(preLastRowNumb, &fRow0);
+      offset = preLastRowNumb;
+    }
+  }
+  // Push the last/only group into the queue.
+  // if (fRowGroup.getRowCount() > 0)
+    // tempRGDataList.push_front(data);
+
+  return true;
+}
+
 const string LimitedOrderBy::toString() const
 {
   ostringstream oss;
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 8f5717bea7..658f703c1c 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -51,6 +51,8 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   const std::string toString() const override;
 
   void finalize();
+  void brandNewFinalize();
+  bool getNextRGData(RGData& data);
   void flushCurrentToDisk() {}
 
  protected:
@@ -58,6 +60,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   uint64_t fCount;
   uint64_t fUncommitedMemory;
   static const uint64_t fMaxUncommited;
+  uint64_t fOffsetInOrderedRowsQueue;
 };
 
 }  // namespace joblist
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 806501a7bc..086681341e 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -611,106 +611,136 @@ void TupleAnnexStep::executeWithOrderBy()
   RGData rgDataOut;
   bool more = false;
 
-  for(;;)
+  try
   {
-    try
+    for (;;)
     {
-      more = fInputDL->next(fInputIterator, &rgDataIn);
-
-      if (traceOn())
-        dlTimes.setFirstReadTime();
+      try
+      {
+        more = fInputDL->next(fInputIterator, &rgDataIn);
 
-      StepTeleStats sts(fQueryUuid, fStepUuid, StepTeleStats::ST_START, 1);
-      postStepStartTele(sts);
+        if (traceOn())
+          dlTimes.setFirstReadTime();
 
-      while (more && !cancelled())
-      {
-        fRowGroupIn.setData(&rgDataIn);
-        fRowGroupIn.getRow(0, &fRowIn);
+        StepTeleStats sts(fQueryUuid, fStepUuid, StepTeleStats::ST_START, 1);
+        postStepStartTele(sts);
 
-        for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
+        while (more && !cancelled())
         {
-          fOrderBy->processRow(fRowIn);
-          fRowIn.nextRow();
-        }
+          fRowGroupIn.setData(&rgDataIn);
+          fRowGroupIn.getRow(0, &fRowIn);
 
-        more = fInputDL->next(fInputIterator, &rgDataIn);
+          for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
+          {
+            fOrderBy->processRow(fRowIn);
+            fRowIn.nextRow();
+          }
+
+          more = fInputDL->next(fInputIterator, &rgDataIn);
+        }
+      }
+      catch (const logging::OutOfMemoryExcept&)
+      {
+        enableFlushToDisk();
+        incrementGenerationCounter();
+      }
+      catch (...)
+      {
+        handleException(std::current_exception(), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
+                        "TupleAnnexStep::executeWithOrderBy()");
       }
 
-      if (!isDiskBased())
+      if (isFlushToDiskEnabled())
       {
-        fOrderBy->finalize();
+        fOrderBy->flushCurrentToDisk();
+        disableFlushToDisk();
+      }
+      else
+      {
+        break;
+      }
+    }
+  }
+  catch (const logging::OutOfMemoryExcept&)
+  {
+    auto errorCode = fOrderBy->getErrorCode();
+    auto newException = OutOfMemoryExcept(errorCode);
+    handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
+                    logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeWithOrderBy()");
+  }
 
-        if (!cancelled())
+  // store avg RGData size
+  if (getGenerationCounter())
+  {
+    // assess RAM available, avg RGData size statistics and free enough memory
+    // return memory if needed
+    // create outputDLs or simplier atomic queues + readers threads
+    // call fOrderBy->diskBasedMergePhase()
+  }
+
+  // if (!isDiskBased())
+  {
+    // {
+    //   if (fUncommitedMemory > 0)
+    //   {
+    //     if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
+    //     {
+    //       cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+    //       throw logging::OutOfMemoryExcept(fErrorCode);
+    //     }
+    //     fMemSize += fUncommitedMemory;
+    //     fUncommitedMemory = 0;
+    //   }
+    //   if (fRowGroup.getRowCount() > 0)
+    //     fDataQueue.push(fData);
+    // }
+    // replace with pushing the leftovers of LimitedOrderBy into the RGDATA queue
+    fOrderBy->brandNewFinalize();
+
+    if (!cancelled())
+    {
+      while (fOrderBy->getData(rgDataIn))
+      {
+        if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
+        {
+          rgDataOut = rgDataIn;
+          fRowGroupOut.setData(&rgDataOut);
+        }
+        else  // TODO push this into finalize to populate next RGData rows
         {
-          while (fOrderBy->getData(rgDataIn))
+          fRowGroupIn.setData(&rgDataIn);
+          fRowGroupIn.getRow(0, &fRowIn);
+
+          rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
+          fRowGroupOut.setData(&rgDataOut);
+          fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
+          fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
+          fRowGroupOut.getRow(0, &fRowOut);
+
+          for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
           {
-            if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
-            {
-              rgDataOut = rgDataIn;
-              fRowGroupOut.setData(&rgDataOut);
-            }
+            if (fConstant)
+              fConstant->fillInConstants(fRowIn, fRowOut);
             else
-            {
-              fRowGroupIn.setData(&rgDataIn);
-              fRowGroupIn.getRow(0, &fRowIn);
-
-              rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
-              fRowGroupOut.setData(&rgDataOut);
-              fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
-              fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
-              fRowGroupOut.getRow(0, &fRowOut);
-
-              for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
-              {
-                if (fConstant)
-                  fConstant->fillInConstants(fRowIn, fRowOut);
-                else
-                  copyRow(fRowIn, &fRowOut);
-
-                fRowGroupOut.incRowCount();
-                fRowOut.nextRow();
-                fRowIn.nextRow();
-              }
-            }
+              copyRow(fRowIn, &fRowOut);
 
-            if (fRowGroupOut.getRowCount() > 0)
-            {
-              fRowsReturned += fRowGroupOut.getRowCount();
-              fOutputDL->insert(rgDataOut);
-
-              // release RGData memory
-              size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
-              fOrderBy->returnRGDataMemory2RM(rgDataSize);
-            }
+            fRowGroupOut.incRowCount();
+            fRowOut.nextRow();
+            fRowIn.nextRow();
           }
         }
-      }
-    }
-    catch (const logging::OutOfMemoryExcept&)
-    {
-      // if (!isDiskBased()) // if enabled
-      // {
-      convertToDiskBased();
-      // continue;
-      // }
-      // else
-      // {
-      //   auto errorCode = fOrderBy->getErrorCode();
-      //   auto newException = OutOfMemoryExcept(errorCode);
-      //   handleException(std::make_exception_ptr(newException), logging::ERR_IN_PROCESS,
-      //                   logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeWithOrderBy()");
-      // }
-    }
-    catch (...)
-    {
-      handleException(std::current_exception(), logging::ERR_IN_PROCESS, logging::ERR_ALWAYS_CRITICAL,
-                      "TupleAnnexStep::executeWithOrderBy()");
-    }
 
-    if (isDiskBased())
-    {
-      fOrderBy->flushCurrentToDisk();
+        if (fRowGroupOut.getRowCount() > 0)
+        {
+          fRowsReturned += fRowGroupOut.getRowCount();
+          fOutputDL->insert(rgDataOut);
+
+          // release RGData memory
+          // TODO add some batching here to reduce atomic overhead.
+          size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
+          fOrderBy->returnRGDataMemory2RM(rgDataSize);
+        }
+      }
     }
   }
 
@@ -719,6 +749,8 @@ void TupleAnnexStep::executeWithOrderBy()
 
   // Bug 3136, let mini stats to be formatted if traceOn.
   fOutputDL->endOfInput();
+
+  // TODO clean existing leftover disk-based files.
 }
 
 /*
diff --git a/dbcon/joblist/tupleannexstep.h b/dbcon/joblist/tupleannexstep.h
index 2c0beae174..be3c003c14 100644
--- a/dbcon/joblist/tupleannexstep.h
+++ b/dbcon/joblist/tupleannexstep.h
@@ -114,15 +114,26 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   void printCalTrace();
   void finalizeParallelOrderBy();
   void finalizeParallelOrderByDistinct();
-  void convertToDiskBased()
+  void enableFlushToDisk()
   {
-    fDiskBased.store(true, std::memory_order_relaxed);
+    fFlushToDisk.store(true, std::memory_order_relaxed);
   }
-  bool isDiskBased() const
+  void disableFlushToDisk()
   {
-    return fDiskBased.load(std::memory_order_relaxed);
+    fFlushToDisk.store(false, std::memory_order_relaxed);
+  }
+  bool isFlushToDiskEnabled() const
+  {
+    return fFlushToDisk.load(std::memory_order_relaxed);
+  }
+  void incrementGenerationCounter()
+  {
+    ++fGenerationCounter;
+  }
+  uint64_t getGenerationCounter() const
+  {
+    return fGenerationCounter;
   }
-
 
   // input/output rowgroup and row
   rowgroup::RowGroup fRowGroupIn;
@@ -182,7 +193,8 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   uint16_t fFinishedThreads;
   boost::mutex fParallelFinalizeMutex;
   joblist::ResourceManager* fRm;
-  std::atomic<bool> fDiskBased;
+  std::atomic<bool> fFlushToDisk {false};
+  uint64_t fGenerationCounter {0};
 };
 
 }  // namespace joblist

From 5af5e1f2115d2b03a7ad5d325ba2b90c94a0ec44 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Mon, 21 Apr 2025 19:20:13 +0000
Subject: [PATCH 03/14] feat(sorting,TNS): replacing LimitedOrderBy::finalize
 with a lightweight finalize + heavy getNextRGData

---
 dbcon/joblist/limitedorderby.cpp | 109 +++++++------------------------
 dbcon/joblist/limitedorderby.h   |   3 +-
 dbcon/joblist/tupleannexstep.cpp |   6 +-
 3 files changed, 29 insertions(+), 89 deletions(-)

diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index 7b313de3ce..86f118de67 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -173,20 +173,19 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   }
 }
 
-  void LimitedOrderBy::brandNewFinalize()
-  {
-    auto& orderedRowsQueue = getQueue();
-    uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
-
-    uint64_t effectiveCount = std::min(queueSizeWoOffset, fCount);
-    uint64_t fOffsetInOrderedRowsQueue = effectiveCount % fRowsPerRG;
-    if (fOffsetInOrderedRowsQueue == 0 && effectiveCount > 0)
-    {
-      fOffsetInOrderedRowsQueue = fRowsPerRG;
-    }
+void LimitedOrderBy::brandNewFinalize()
+{
+  auto& orderedRowsQueue = getQueue();
+  uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
 
+  // Skip OFFSET
+  uint64_t sqlOffset = fStart;
+  while (sqlOffset > 0 && !orderedRowsQueue.empty())
+  {
+    orderedRowsQueue.pop();
+    --sqlOffset;
   }
-
+}
 
 /*
  * The f() copies top element from an ordered queue into a row group. It
@@ -297,8 +296,6 @@ void LimitedOrderBy::finalize()
   }
 }
 
-
-
 bool LimitedOrderBy::getNextRGData(RGData& data)
 {
   auto& orderedRowsQueue = getQueue();
@@ -308,89 +305,31 @@ bool LimitedOrderBy::getNextRGData(RGData& data)
     return false;
   }
 
-  // *DRRTUY Very memory intensive. CS needs to account active
-  // memory only and release memory if needed.
-  // uint64_t memSizeInc = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
-  uint64_t memSizeInc = 0ULL;
-
-  // if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
-  // {
-  //   cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-  //   throw logging::OutOfMemoryExcept(fErrorCode);
-  // }
-  // fMemSize += memSizeInc;
-
-  uint64_t offset = 0;
-  uint64_t i = 0;
-  // Reduce queue size by an offset value if it applicable.
-  uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
-  // list<RGData> tempRGDataList;
-
-  // ordered rows queue offset calculation based on LIMIT/OFFSET of a query
-  // looks like count = min(fCount, queueSizeWoOffset)
-  if (fCount <= queueSizeWoOffset)
-  {
-    offset = fCount % fRowsPerRG;
-    // If fCount is a multiply of fRowsPerRG
-    if (!offset && fCount > 0)
-      offset = fRowsPerRG;
-  }
-  else
-  {
-    offset = queueSizeWoOffset % fRowsPerRG;
-    // If fCount is a multiply of fRowsPerRG
-    if (!offset && queueSizeWoOffset > 0)
-      offset = fRowsPerRG;
-  }
-
-  // list<RGData>::iterator tempListIter = tempRGDataList.begin();
-
-  i = 0;
   uint32_t rSize = fRow0.getSize();
-  uint64_t preLastRowNumb = fRowsPerRG - 1;
-
   data.reinit(fRowGroup, fRowsPerRG);
   fRowGroup.setData(&data);
   fRowGroup.resetRowGroup(0);
-  // *DRRTUY This approach won't work with
-  // OFFSET > fRowsPerRG
-  offset = offset != 0 ? offset - 1 : offset;
-  fRowGroup.getRow(offset, &fRow0);
+  fRowGroup.getRow(0, &fRow0);
 
-  while ((orderedRowsQueue.size() > fStart) && (i++ < fCount))
+  uint64_t thisRGRowNumber = 0;
+  // find number of rows to retrieve from the queue using SQL LIMIT
+  // and the current sorted queue size.
+  uint64_t rowsToRetrieve = std::min(fCount - fRowsReturned, fRowsPerRG);
+  uint64_t rowsToRetrieveFromQueue = std::min(rowsToRetrieve, orderedRowsQueue.size());
+
+  for (; rowsToRetrieveFromQueue > thisRGRowNumber; ++thisRGRowNumber)
   {
     const OrderByRow& topRow = orderedRowsQueue.top();
-    row1.setData(topRow.data);
+    row1.setData(topRow.fData);
     copyRow(row1, &fRow0);
     fRowGroup.incRowCount();
-    offset--;
-    fRow0.prevRow(rSize);
+    fRow0.nextRow(rSize);
     orderedRowsQueue.pop();
-
-    // if RG has fRowsPerRG rows
-    if (offset == (uint64_t)-1)
-    {
-      tempRGDataList.push_front(data);
-
-      if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
-      {
-        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-        throw logging::OutOfMemoryExcept(fErrorCode);
-      }
-      fMemSize += memSizeInc;
-
-      data.reinit(fRowGroup, fRowsPerRG);
-      fRowGroup.setData(&data);
-      fRowGroup.resetRowGroup(0);  // ?
-      fRowGroup.getRow(preLastRowNumb, &fRow0);
-      offset = preLastRowNumb;
-    }
   }
-  // Push the last/only group into the queue.
-  // if (fRowGroup.getRowCount() > 0)
-    // tempRGDataList.push_front(data);
 
-  return true;
+  fRowsReturned += rowsToRetrieveFromQueue;
+
+  return rowsToRetrieveFromQueue > 0;
 }
 
 const string LimitedOrderBy::toString() const
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 658f703c1c..46c0da5fb9 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -52,7 +52,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
 
   void finalize();
   void brandNewFinalize();
-  bool getNextRGData(RGData& data);
+  bool getNextRGData(rowgroup::RGData& data);
   void flushCurrentToDisk() {}
 
  protected:
@@ -61,6 +61,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   uint64_t fUncommitedMemory;
   static const uint64_t fMaxUncommited;
   uint64_t fOffsetInOrderedRowsQueue;
+  uint64_t fRowsReturned;
 };
 
 }  // namespace joblist
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 086681341e..fd4117a968 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -186,7 +186,7 @@ void TupleAnnexStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo)
     if (fOrderBy)
     {
       fOrderBy->distinct(fDistinct);
-      fOrderBy->initialize(rgIn, jobInfo);
+      fOrderBy->initialize(rgIn, jobInfo, true, false);
     }
   }
 
@@ -699,7 +699,7 @@ void TupleAnnexStep::executeWithOrderBy()
 
     if (!cancelled())
     {
-      while (fOrderBy->getData(rgDataIn))
+      while (fOrderBy->getNextRGData(rgDataIn))
       {
         if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
         {
@@ -738,7 +738,7 @@ void TupleAnnexStep::executeWithOrderBy()
           // release RGData memory
           // TODO add some batching here to reduce atomic overhead.
           size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
-          fOrderBy->returnRGDataMemory2RM(rgDataSize);
+          // fOrderBy->returnRGDataMemory2RM(rgDataSize);
         }
       }
     }

From 1c4328aec394599c62f850281933b7bcf1b0f324 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Tue, 22 Apr 2025 21:14:59 +0000
Subject: [PATCH 04/14] feat(sorting): fixed DSC orientation with a new
 simplified getData method.

---
 dbcon/joblist/limitedorderby.cpp | 101 +++++++++++++++++++++++++++++--
 dbcon/joblist/limitedorderby.h   |   3 +-
 dbcon/joblist/tupleannexstep.cpp |  19 +-----
 3 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index 86f118de67..fcff097d18 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -109,10 +109,13 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   if (fCount == 0)
     return;
 
+  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl; 
+  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl; 
   auto& orderedRowsQueue = getQueue();
   // if the row count is less than the limit
   if (orderedRowsQueue.size() < fStart + fCount)
   {
+
     copyRow(row, &fRow0);
     OrderByRow newRow(fRow0, fRule);
     orderedRowsQueue.push(newRow);
@@ -155,11 +158,97 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
       fRowGroup.getRow(0, &fRow0);
     }
   }
-
   else if (fOrderByCond.size() > 0 && fRule.less(row.getPointer(), orderedRowsQueue.top().fData))
   {
     OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
+    std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
+    std::cout <<"LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
+
+    copyRow(row, &row1);
+
+    if (fDistinct)
+    {
+      fDistinctMap->erase(orderedRowsQueue.top().fData);
+      fDistinctMap->insert(row1.getPointer());
+    }
+
+    orderedRowsQueue.pop();
+    orderedRowsQueue.push(swapRow);
+  }
+}
+
+
+void LimitedOrderBy::processRow_(const rowgroup::Row& row)
+{
+  // check if this is a distinct row
+  if (fDistinct && fDistinctMap->find(row.getPointer()) != fDistinctMap->end())
+    return;
+
+  // @bug5312, limit count is 0, do nothing.
+  if (fCount == 0)
+    return;
+
+
+  // TODO copy rules or replace ptrs to real instances in CompareRules
+  // auto invertedRule = fRule;
+  // invertedRule.revertRules();
+
+  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl; 
+  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl; 
+  auto& orderedRowsQueue = getQueue();
+  // if the row count is less than the limit
+  if (orderedRowsQueue.size() < fStart + fCount)
+  {
+    copyRow(row, &fRow0);
+    OrderByRow newRow(fRow0, fRule);
+    orderedRowsQueue.push(newRow);
+
+    uint64_t memSizeInc = sizeof(newRow);
+    fUncommitedMemory += memSizeInc;
+    if (fUncommitedMemory >= fMaxUncommited)
+    {
+      if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
+      {
+        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+        throw IDBExcept(fErrorCode);
+      }
+      fMemSize += fUncommitedMemory;
+      fUncommitedMemory = 0;
+    }
+
+    // add to the distinct map
+    if (fDistinct)
+      fDistinctMap->insert(fRow0.getPointer());
+
+    fRowGroup.incRowCount();
+    fRow0.nextRow();
+
+    if (fRowGroup.getRowCount() >= fRowsPerRG)
+    {
+      fDataQueue.push(fData);
+      uint64_t newSize = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
+
+      if (!fRm->getMemory(newSize, fSessionMemLimit))
+      {
+        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+        throw IDBExcept(fErrorCode);
+      }
+      fMemSize += newSize;
+
+      fData.reinit(fRowGroup, fRowsPerRG);
+      fRowGroup.setData(&fData);
+      fRowGroup.resetRowGroup(0);
+      fRowGroup.getRow(0, &fRow0);
+    }
+  }
+  else if (fOrderByCond.size() > 0 && invertedRule.less(row.getPointer(), orderedRowsQueue.top().fData))
+  {
+    OrderByRow swapRow = orderedRowsQueue.top();
+    row1.setData(swapRow.fData);
+    std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
+    std::cout <<"LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
+
     copyRow(row, &row1);
 
     if (fDistinct)
@@ -176,18 +265,21 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
 void LimitedOrderBy::brandNewFinalize()
 {
   auto& orderedRowsQueue = getQueue();
-  uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
 
   // Skip OFFSET
   uint64_t sqlOffset = fStart;
+  std::cout << "brandNewFinalize offset " << sqlOffset << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
   while (sqlOffset > 0 && !orderedRowsQueue.empty())
   {
+    auto r = orderedRowsQueue.top();
+    row1.setData(r.fData);
+    std::cout << "brandNewFinalize row " << row1.toString() << std::endl;
     orderedRowsQueue.pop();
     --sqlOffset;
   }
 }
 
-/*
+/* 
  * The f() copies top element from an ordered queue into a row group. It
  * does this backwards to syncronise sorting orientation with the server.
  * The top row from the queue goes last into the returned set.
@@ -316,7 +408,8 @@ bool LimitedOrderBy::getNextRGData(RGData& data)
   // and the current sorted queue size.
   uint64_t rowsToRetrieve = std::min(fCount - fRowsReturned, fRowsPerRG);
   uint64_t rowsToRetrieveFromQueue = std::min(rowsToRetrieve, orderedRowsQueue.size());
-
+  std::cout << "getNextRGData rowsToRetrieve " << rowsToRetrieve << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
+  std::cout << "getNextRGData rowsToRetrieveFromQueue " << rowsToRetrieveFromQueue << std::endl;
   for (; rowsToRetrieveFromQueue > thisRGRowNumber; ++thisRGRowNumber)
   {
     const OrderByRow& topRow = orderedRowsQueue.top();
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 46c0da5fb9..36e5e85d44 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -43,6 +43,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   void initialize(const rowgroup::RowGroup&, const JobInfo&, bool invertRules = false,
                   bool isMultiThreded = false);
   void processRow(const rowgroup::Row&) override;
+  void processRow_(const rowgroup::Row&);
   uint64_t getKeyLength() const override;
   uint64_t getLimitCount() const
   {
@@ -61,7 +62,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   uint64_t fUncommitedMemory;
   static const uint64_t fMaxUncommited;
   uint64_t fOffsetInOrderedRowsQueue;
-  uint64_t fRowsReturned;
+  uint64_t fRowsReturned{0};
 };
 
 }  // namespace joblist
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index fd4117a968..b3a3971ba7 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -632,7 +632,7 @@ void TupleAnnexStep::executeWithOrderBy()
 
           for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
           {
-            fOrderBy->processRow(fRowIn);
+            fOrderBy->processRow_(fRowIn);
             fRowIn.nextRow();
           }
 
@@ -680,21 +680,6 @@ void TupleAnnexStep::executeWithOrderBy()
 
   // if (!isDiskBased())
   {
-    // {
-    //   if (fUncommitedMemory > 0)
-    //   {
-    //     if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
-    //     {
-    //       cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-    //       throw logging::OutOfMemoryExcept(fErrorCode);
-    //     }
-    //     fMemSize += fUncommitedMemory;
-    //     fUncommitedMemory = 0;
-    //   }
-    //   if (fRowGroup.getRowCount() > 0)
-    //     fDataQueue.push(fData);
-    // }
-    // replace with pushing the leftovers of LimitedOrderBy into the RGDATA queue
     fOrderBy->brandNewFinalize();
 
     if (!cancelled())
@@ -737,7 +722,7 @@ void TupleAnnexStep::executeWithOrderBy()
 
           // release RGData memory
           // TODO add some batching here to reduce atomic overhead.
-          size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
+          // size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
           // fOrderBy->returnRGDataMemory2RM(rgDataSize);
         }
       }

From 6a31569e914e8d30c5ded0ad078199cb022e26f2 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Thu, 1 May 2025 21:31:14 +0000
Subject: [PATCH 05/14] feat(struct): MinMaxHeap struct

---
 dbcon/joblist/limitedorderby.cpp |   3 +-
 dbcon/joblist/tupleannexstep.cpp |   2 +-
 utils/structs/minmaxheap.h       | 177 +++++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 utils/structs/minmaxheap.h

diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index fcff097d18..0f844b8fa7 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -242,7 +242,7 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
       fRowGroup.getRow(0, &fRow0);
     }
   }
-  else if (fOrderByCond.size() > 0 && invertedRule.less(row.getPointer(), orderedRowsQueue.top().fData))
+  else if (fOrderByCond.size() > 0 && fRule.less(row.getPointer(), orderedRowsQueue.top().fData))
   {
     OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
@@ -414,6 +414,7 @@ bool LimitedOrderBy::getNextRGData(RGData& data)
   {
     const OrderByRow& topRow = orderedRowsQueue.top();
     row1.setData(topRow.fData);
+    std::cout << "getNextRGData row " << row1.toString() << std::endl;
     copyRow(row1, &fRow0);
     fRowGroup.incRowCount();
     fRow0.nextRow(rSize);
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index b3a3971ba7..562242ab7e 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -186,7 +186,7 @@ void TupleAnnexStep::initialize(const RowGroup& rgIn, const JobInfo& jobInfo)
     if (fOrderBy)
     {
       fOrderBy->distinct(fDistinct);
-      fOrderBy->initialize(rgIn, jobInfo, true, false);
+      fOrderBy->initialize(rgIn, jobInfo, false, false);
     }
   }
 
diff --git a/utils/structs/minmaxheap.h b/utils/structs/minmaxheap.h
new file mode 100644
index 0000000000..fba0eef5cd
--- /dev/null
+++ b/utils/structs/minmaxheap.h
@@ -0,0 +1,199 @@
+#pragma once
+
+#include <cstddef>      // std::size_t
+#include <functional>   // std::less
+#include <vector>
+#include <stdexcept>
+#include <utility>      // std::swap
+#include <algorithm>    // std::min_element, std::max_element
+
+// Double-ended priority queue stored as a min-max heap in a std::vector.
+// Nodes on even depths (the root is depth 0) are "min" nodes: none of their
+// descendants orders before them. Nodes on odd depths are "max" nodes: none
+// of their descendants orders after them. Ordering is given by Compare
+// (std::less by default), so getMin() is the root and getMax() is the larger
+// of the root's children (or the root itself when it is the only element).
+template <typename T, typename Compare = std::less<T>>
+class MinMaxHeap {
+public:
+    // --- interface ---
+    bool empty() const noexcept { return data_.empty(); }
+    std::size_t size() const noexcept { return data_.size(); }
+
+    // First element in Compare order; throws std::out_of_range if the heap is empty.
+    const T& getMin() const {
+        if (empty()) throw std::out_of_range("heap empty");
+        return data_[0];
+    }
+
+    // Last element in Compare order; throws std::out_of_range if the heap is empty.
+    const T& getMax() const {
+        if (empty()) throw std::out_of_range("heap empty");
+        if (data_.size() == 1) return data_[0];
+        if (data_.size() == 2) return data_[1];
+        return cmp_(data_[1], data_[2]) ? data_[2] : data_[1]; // whichever is larger
+    }
+
+    // Inserts a copy of x.
+    void push(const T& x) {
+        data_.push_back(x);
+        bubbleUp(data_.size() - 1);
+    }
+
+    // Removes the element getMin() refers to; throws std::out_of_range if empty.
+    void popMin() {
+        if (empty()) throw std::out_of_range("heap empty");
+        moveLastToRootThenTrickleDown(0);
+    }
+
+    // Removes the element getMax() refers to; throws std::out_of_range if empty.
+    void popMax() {
+        if (empty()) throw std::out_of_range("heap empty");
+        std::size_t maxi = maxIndexUnderRoot();
+        moveLastToRootThenTrickleDown(maxi);
+    }
+
+private:
+    // --- internal data ---
+    std::vector<T> data_;
+    Compare cmp_{};                // std::less by default ⇒ min on even levels
+
+    // --- index helpers ---
+    static std::size_t parent(std::size_t i)     { return (i - 1) / 2; }
+    // Grandparent, i.e. parent(parent(i)); only meaningful for i >= 3.
+    static std::size_t grand(std::size_t i)      { return (i - 3) / 4; }
+    static std::size_t left  (std::size_t i)     { return 2 * i + 1; }
+    static std::size_t right (std::size_t i)     { return 2 * i + 2; }
+
+    // True when index i sits on an even depth (a min level). Walks up to the
+    // root flipping the parity at every step, so no C++20 <bit> is required.
+    static bool isMinLevel(std::size_t i) {
+        bool minLevel = true;                      // the root is a min level
+        for (; i != 0; i = parent(i)) minLevel = !minLevel;
+        return minLevel;
+    }
+
+    // bubble-up (insertion)
+    void bubbleUp(std::size_t i) {
+        if (i == 0) return;
+        std::size_t p = parent(i);
+        if (isMinLevel(i)) {
+            if (!cmp_(data_[i], data_[p])) {               // x >= parent ⇒ continue on the max levels
+                std::swap(data_[i], data_[p]);
+                bubbleUpMax(p);
+            } else {
+                bubbleUpMin(i);
+            }
+        } else {                                           // max level
+            if (cmp_(data_[i], data_[p])) {                // x < parent ⇒ continue on the min levels
+                std::swap(data_[i], data_[p]);
+                bubbleUpMin(p);
+            } else {
+                bubbleUpMax(i);
+            }
+        }
+    }
+
+    void bubbleUpMin(std::size_t i) {
+        while (i >= 3) {
+            std::size_t g = grand(i);
+            if (cmp_(data_[i], data_[g])) {
+                std::swap(data_[i], data_[g]);
+                i = g;
+            } else break;
+        }
+    }
+
+    void bubbleUpMax(std::size_t i) {
+        while (i >= 3) {
+            std::size_t g = grand(i);
+            if (!cmp_(data_[i], data_[g])) {
+                std::swap(data_[i], data_[g]);
+                i = g;
+            } else break;
+        }
+    }
+
+    // trickle-down (removal)
+    void trickleDown(std::size_t i) {
+        if (isMinLevel(i))
+            trickleDownMin(i);
+        else
+            trickleDownMax(i);
+    }
+
+    void trickleDownMin(std::size_t i) {
+        while (left(i) < data_.size()) {
+            std::size_t m = minDescendant(i);
+            if (isGrandchild(i, m)) {
+                if (cmp_(data_[m], data_[i])) std::swap(data_[m], data_[i]);
+                std::size_t p = parent(m);
+                if (!cmp_(data_[m], data_[p])) std::swap(data_[m], data_[p]);
+                i = m;
+            } else {    // direct child
+                if (cmp_(data_[m], data_[i])) std::swap(data_[m], data_[i]);
+                break;
+            }
+        }
+    }
+
+    void trickleDownMax(std::size_t i) {
+        while (left(i) < data_.size()) {
+            std::size_t m = maxDescendant(i);
+            if (isGrandchild(i, m)) {
+                if (!cmp_(data_[m], data_[i])) std::swap(data_[m], data_[i]);
+                std::size_t p = parent(m);
+                if (cmp_(data_[m], data_[p])) std::swap(data_[m], data_[p]);
+                i = m;
+            } else {
+                if (!cmp_(data_[m], data_[i])) std::swap(data_[m], data_[i]);
+                break;
+            }
+        }
+    }
+
+    // helpers
+    bool isGrandchild(std::size_t i, std::size_t m) const {
+        return m >= left(left(i));
+    }
+
+    std::size_t minDescendant(std::size_t i) const {
+        return descendantByCompare(i, /*wantMin=*/true);
+    }
+
+    std::size_t maxDescendant(std::size_t i) const {
+        return descendantByCompare(i, /*wantMin=*/false);
+    }
+
+    std::size_t descendantByCompare(std::size_t i, bool wantMin) const {
+        std::size_t best = left(i);
+        if (right(i) < data_.size() &&
+            (wantMin ? cmp_(data_[right(i)], data_[best])
+                     : cmp_(data_[best], data_[right(i)])))
+            best = right(i);
+
+        // grandchildren
+        for (std::size_t c = left(left(i)); c < data_.size() && c <= right(right(i)); ++c) {
+            bool better = wantMin ? cmp_(data_[c], data_[best])
+                                  : cmp_(data_[best], data_[c]);
+            if (better) best = c;
+        }
+        return best;
+    }
+
+    // remove node j: move the last element into slot j and sift it down
+    void moveLastToRootThenTrickleDown(std::size_t j) {
+        if (j + 1 != data_.size())                 // skip the self-move when j is the last slot
+            data_[j] = std::move(data_.back());
+        data_.pop_back();
+        if (j < data_.size())
+            trickleDown(j);
+    }
+
+    // index of the max element: the root itself or the larger of its children
+    std::size_t maxIndexUnderRoot() const {
+        if (data_.size() == 1) return 0;
+        if (data_.size() == 2) return 1;
+        return cmp_(data_[1], data_[2]) ? 2 : 1;
+    }
+};
\ No newline at end of file

From cb48d02b6425a895f872616a8b77292bdfa3b25b Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Thu, 1 May 2025 21:34:01 +0000
Subject: [PATCH 06/14] feat(structs): UT for MinMaxHeap

---
 tests/minmaxheap.cpp | 151 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 tests/minmaxheap.cpp

diff --git a/tests/minmaxheap.cpp b/tests/minmaxheap.cpp
new file mode 100644
index 0000000000..2c28ba204d
--- /dev/null
+++ b/tests/minmaxheap.cpp
@@ -0,0 +1,151 @@
+/* Copyright (C) 2024 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#include <gtest/gtest.h>
+#include <random>
+#include <set>
+#include "utils/structs/minmaxheap.h"   // header that contains template class MinMaxHeap
+
+using IntHeap = MinMaxHeap<int>;          // default std::less comparator
+using IntInvHeap = MinMaxHeap<int, std::greater<int>>; // inverted ordering
+
+// ---------- Basic behaviour -------------------------------------------------
+
+TEST(MinMaxHeap, InitiallyEmpty) {
+    IntHeap h;
+    EXPECT_TRUE(h.empty());
+    EXPECT_EQ(h.size(), 0u);  // unsigned literal: size() returns std::size_t, avoids a sign-compare warning
+    EXPECT_THROW(h.getMin(), std::out_of_range);
+    EXPECT_THROW(h.getMax(), std::out_of_range);
+    EXPECT_THROW(h.popMin(), std::out_of_range);
+    EXPECT_THROW(h.popMax(), std::out_of_range);
+}
+
+TEST(MinMaxHeap, SingleElement) {
+    IntHeap h;
+    h.push(42);
+    EXPECT_FALSE(h.empty());
+    EXPECT_EQ(h.size(), 1u);  // unsigned literal: size() returns std::size_t, avoids a sign-compare warning
+    EXPECT_EQ(h.getMin(), 42);
+    EXPECT_EQ(h.getMax(), 42);
+    h.popMin();
+    EXPECT_TRUE(h.empty());
+}
+
+TEST(MinMaxHeap, OrderAfterInsertions) {
+    IntHeap h;
+    h.push(10);
+    h.push(5);
+    h.push(20);
+    EXPECT_EQ(h.getMin(), 5);
+    EXPECT_EQ(h.getMax(), 20);
+}
+
+TEST(MinMaxHeap, PopMinAsc) {
+    IntHeap h;
+    for (int v : {7, 1, 4, 9, 3}) h.push(v);
+
+    std::vector<int> asc;
+    while (!h.empty()) {
+        asc.push_back(h.getMin());
+        h.popMin();
+    }
+    EXPECT_EQ(asc, (std::vector<int>{1, 3, 4, 7, 9}));
+}
+
+TEST(MinMaxHeap, PopMaxDesc) {
+    IntHeap h;
+    for (int v : {7, 1, 4, 9, 3}) h.push(v);
+
+    std::vector<int> desc;
+    while (!h.empty()) {
+        desc.push_back(h.getMax());
+        h.popMax();
+    }
+    EXPECT_EQ(desc, (std::vector<int>{9, 7, 4, 3, 1}));
+}
+
+TEST(MinMaxHeap, MixedPops) {
+    IntHeap h;
+    for (int i = 1; i <= 5; ++i) h.push(i); // heap contains 1..5
+
+    // min(1) → {2,3,4,5}
+    EXPECT_EQ(h.getMin(), 1);
+    h.popMin();
+    // max(5) → {2,3,4}
+    EXPECT_EQ(h.getMax(), 5);
+    h.popMax();
+    // min(2) → {3,4}
+    EXPECT_EQ(h.getMin(), 2);
+    h.popMin();
+    // max(4) → {3}
+    EXPECT_EQ(h.getMax(), 4);
+    h.popMax();
+    // last element
+    EXPECT_EQ(h.getMin(), 3);
+    EXPECT_EQ(h.getMax(), 3);
+    h.popMin();
+
+    EXPECT_TRUE(h.empty());
+}
+
+// ---------- Comparator test --------------------------------------------------
+
+TEST(MinMaxHeap, CustomComparator) {
+    IntInvHeap h; // std::greater ⇒ heap treats greater elements as *smaller*
+    for (int v : {3, 1, 4}) h.push(v);
+    // now "minimum" is actually the *largest* integer because of comparator
+    EXPECT_EQ(h.getMin(), 4);
+    EXPECT_EQ(h.getMax(), 1);
+}
+
+// ---------- Robustness & stress ---------------------------------------------
+
+TEST(MinMaxHeap, RandomAgainstMultiset) {
+    constexpr int N = 10000;
+    std::mt19937 rng(42);
+    std::uniform_int_distribution<int> dist(-100000, 100000);
+
+    IntHeap heap;
+    std::multiset<int> ref;
+
+    for (int i = 0; i < N; ++i) {
+        if (ref.empty() || dist(rng) % 2) { // 50 % push
+            int val = dist(rng);
+            heap.push(val);
+            ref.insert(val);
+        } else if (dist(rng) % 2) {         // 25 % popMin
+            int refMin = *ref.begin();
+            EXPECT_EQ(heap.getMin(), refMin);
+            heap.popMin();
+            ref.erase(ref.begin());
+        } else {                            // 25 % popMax
+            int refMax = *ref.rbegin();
+            EXPECT_EQ(heap.getMax(), refMax);
+            heap.popMax();
+            auto it = std::prev(ref.end());
+            ref.erase(it);
+        }
+        // invariants after each mutation
+        if (!ref.empty()) {
+            EXPECT_EQ(heap.getMin(), *ref.begin());
+            EXPECT_EQ(heap.getMax(), *ref.rbegin());
+        } else {
+            EXPECT_TRUE(heap.empty());
+        }
+    }
+}

From a0cf50710567ae6e2c9238511fd9e16591dc2090 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 2 May 2025 15:09:06 +0000
Subject: [PATCH 07/14] feat(TNS,sorting): a revised skeleton for single-thread
 disk-based OB

---
 dbcon/joblist/disk-based-topnorderby.h |  58 +++++++++++
 dbcon/joblist/elementtype.h            |  13 +--
 dbcon/joblist/limitedorderby.cpp       | 135 +++++++++++++++++++++++--
 dbcon/joblist/limitedorderby.h         |   8 +-
 dbcon/joblist/tupleannexstep.cpp       | 132 +++++++++++++++---------
 dbcon/joblist/tupleannexstep.h         |  49 +++++----
 tests/minmaxheap.cpp                   |   2 +-
 7 files changed, 305 insertions(+), 92 deletions(-)
 create mode 100644 dbcon/joblist/disk-based-topnorderby.h

diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
new file mode 100644
index 0000000000..e1e17e818c
--- /dev/null
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2025 MariaDB Corp.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "elementtype.h"
+namespace joblist
+{
+// Skeleton of the disk-based TopN ORDER BY bookkeeping: it only counts flush "generations"; the file helpers below are stubs.
+class DiskBasedTopNOrderBy
+{
+ public:
+  DiskBasedTopNOrderBy();
+  ~DiskBasedTopNOrderBy(
+     // clean up left over files
+  );
+  // NOTE(review): the constructor and destructor are only declared in this header; no definition is visible here.
+  // Generation counter: flushCurrentToDisk() bumps it, and isDiskBased() reports whether it is non-zero.
+  void incrementGenerationCounter()
+  {
+    ++fGenerationCounter;
+  }
+  uint64_t getGenerationCounter() const
+  {
+    return fGenerationCounter;
+  }
+
+  bool isDiskBased() const { return fGenerationCounter > 0; }
+  // Stubs: no generation files exist yet, so these return 0 and an empty list (batchSize is ignored).
+  size_t getGenerationFilesNumber() const { return 0; }
+  std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize) { return {}; }
+  // Flushing only increments the generation counter (firstFlush is ignored); the merge phase is a no-op.
+  void flushCurrentToDisk(const bool firstFlush) { incrementGenerationCounter(); }
+  void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists) {}
+
+//  private:  (left commented out, so fGenerationCounter below is public)
+  uint64_t fGenerationCounter {0};
+};
+
+}
\ No newline at end of file
diff --git a/dbcon/joblist/elementtype.h b/dbcon/joblist/elementtype.h
index 7987498a74..1b35e67f93 100644
--- a/dbcon/joblist/elementtype.h
+++ b/dbcon/joblist/elementtype.h
@@ -1,4 +1,5 @@
 /* Copyright (C) 2014 InfiniDB, Inc.
+   Copyright (C) 2016-2025 MariaDB Corporation
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
@@ -238,16 +239,7 @@ extern std::istream& operator>>(std::istream& in, TupleType& rhs);
 extern std::ostream& operator<<(std::ostream& out, const TupleType& rhs);
 }  // namespace joblist
 
-#ifndef NO_DATALISTS
-
-// #include "bandeddl.h"
-// #include "wsdl.h"
 #include "fifo.h"
-// #include "bucketdl.h"
-// #include "constantdatalist.h"
-// #include "swsdl.h"
-// #include "zdl.h"
-// #include "deliverywsdl.h"
 
 namespace joblist
 {
@@ -327,7 +319,8 @@ typedef DataList<StringElementType> StrDataList;
 // */
 // typedef BucketDL<TupleType> TupleBucketDataList;
 
-typedef FIFO<rowgroup::RGData> RowGroupDL;
+using RowGroupDL = FIFO<rowgroup::RGData>;
+using RowGroupDLSPtr = std::shared_ptr<RowGroupDL>;
 
 }  // namespace joblist
 
diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index 0f844b8fa7..c88598568c 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -262,21 +262,137 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
   }
 }
 
+// void LimitedOrderBy::brandNewFinalize()
+// {
+//   auto& orderedRowsQueue = getQueue();
+
+//   // Skip OFFSET
+//   uint64_t sqlOffset = fStart;
+//   std::cout << "brandNewFinalize offset " << sqlOffset << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
+//   while (sqlOffset > 0 && !orderedRowsQueue.empty())
+//   {
+//     auto r = orderedRowsQueue.top();
+//     row1.setData(r.fData);
+//     std::cout << "brandNewFinalize row " << row1.toString() << std::endl;
+//     orderedRowsQueue.pop();
+//     --sqlOffset;
+//   }
+// }
+
 void LimitedOrderBy::brandNewFinalize()
 {
+  if (!isDiskBased())  // nothing has been flushed to disk: fall back to the in-memory finalize()
+  {
+    return finalize();    
+  }
+
+  // Disk-based case:
+  // at this point there are <= inputQueuesNumber files on disk
+  // and potentially some in-memory state;
+  // both need to be merged together to produce a result.
+
+  if (fUncommitedMemory > 0)  // charge the not-yet-accounted bytes to fRm before going on
+  {
+    if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
+    {
+      cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+      throw logging::OutOfMemoryExcept(fErrorCode);
+    }
+    fMemSize += fUncommitedMemory;
+    fUncommitedMemory = 0;
+  }
+
+  queue<RGData> tempQueue;
+  if (fRowGroup.getRowCount() > 0)
+    fDataQueue.push(fData);
+
   auto& orderedRowsQueue = getQueue();
 
-  // Skip OFFSET
-  uint64_t sqlOffset = fStart;
-  std::cout << "brandNewFinalize offset " << sqlOffset << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
-  while (sqlOffset > 0 && !orderedRowsQueue.empty())
+  if (orderedRowsQueue.size() > 0)
   {
-    auto r = orderedRowsQueue.top();
-    row1.setData(r.fData);
-    std::cout << "brandNewFinalize row " << row1.toString() << std::endl;
-    orderedRowsQueue.pop();
-    --sqlOffset;
+    // *DRRTUY Very memory intensive. CS needs to account active
+    // memory only and release memory if needed.
+    uint64_t memSizeInc = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
+
+    if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
+    {
+      cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+      throw logging::OutOfMemoryExcept(fErrorCode);
+    }
+    fMemSize += memSizeInc;
+
+    uint64_t offset = 0;
+    uint64_t i = 0;
+    // Reduce the queue size by the offset value (fStart) if it is applicable.
+    uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
+    list<RGData> tempRGDataList;
+
+    if (fCount <= queueSizeWoOffset)  // offset = number of rows that go into the first RGData filled below
+    {
+      offset = fCount % fRowsPerRG;
+      if (!offset && fCount > 0)
+        offset = fRowsPerRG;
+    }
+    else
+    {
+      offset = queueSizeWoOffset % fRowsPerRG;
+      if (!offset && queueSizeWoOffset > 0)
+        offset = fRowsPerRG;
+    }
+
+    list<RGData>::iterator tempListIter = tempRGDataList.begin();
+
+    i = 0;
+    uint32_t rSize = fRow0.getSize();
+    uint64_t preLastRowNumb = fRowsPerRG - 1;
+    fData.reinit(fRowGroup, fRowsPerRG);
+    fRowGroup.setData(&fData);
+    fRowGroup.resetRowGroup(0);
+    // *DRRTUY This approach won't work with
+    // OFFSET > fRowsPerRG
+    offset = offset != 0 ? offset - 1 : offset;  // turn the row count into the index of the last row to fill
+    fRowGroup.getRow(offset, &fRow0);
+
+    while ((orderedRowsQueue.size() > fStart) && (i++ < fCount))  // rows are written back to front within each RGData
+    {
+      const OrderByRow& topRow = orderedRowsQueue.top();
+      row1.setData(topRow.fData);
+      copyRow(row1, &fRow0);
+      fRowGroup.incRowCount();
+      offset--;
+      fRow0.prevRow(rSize);
+      orderedRowsQueue.pop();
+
+      // the current RGData is complete once row index 0 has been written
+      if (offset == (uint64_t)-1)  // unsigned wrap-around of the decrement above
+      {
+        tempRGDataList.push_front(fData);
+
+        if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
+        {
+          cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
+          throw logging::OutOfMemoryExcept(fErrorCode);
+        }
+        fMemSize += memSizeInc;
+
+        fData.reinit(fRowGroup, fRowsPerRG);
+        fRowGroup.setData(&fData);
+        fRowGroup.resetRowGroup(0);  // ?
+        fRowGroup.getRow(preLastRowNumb, &fRow0);
+        offset = preLastRowNumb;
+      }
+    }
+    // Push the last/only group into the list.
+    if (fRowGroup.getRowCount() > 0)
+      tempRGDataList.push_front(fData);
+
+    for (tempListIter = tempRGDataList.begin(); tempListIter != tempRGDataList.end(); tempListIter++)
+      tempQueue.push(*tempListIter);
+
+    fDataQueue = tempQueue;  // NOTE(review): overwrites fDataQueue, including the fData pushed before getQueue() - confirm intended
   }
+
+
 }
 
 /* 
@@ -388,6 +504,7 @@ void LimitedOrderBy::finalize()
   }
 }
 
+// WIP UNUSED
 bool LimitedOrderBy::getNextRGData(RGData& data)
 {
   auto& orderedRowsQueue = getQueue();
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 36e5e85d44..5a4107e47e 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -22,8 +22,10 @@
 #pragma once
 
 #include <string>
-#include "rowgroup.h"
+
+#include "disk-based-topnorderby.h"
 #include "../../utils/windowfunction/idborderby.h"
+#include "rowgroup.h"
 
 namespace joblist
 {
@@ -34,7 +36,7 @@ struct JobInfo;
 // This version is for subqueries, limit the result set to fit in memory,
 // use ORDER BY to make the results consistent.
 // The actual output are the first or last # of rows, which are NOT ordered.
-class LimitedOrderBy : public ordering::IdbOrderBy
+class LimitedOrderBy : public ordering::IdbOrderBy, public DiskBasedTopNOrderBy
 {
  public:
   LimitedOrderBy();
@@ -54,7 +56,7 @@ class LimitedOrderBy : public ordering::IdbOrderBy
   void finalize();
   void brandNewFinalize();
   bool getNextRGData(rowgroup::RGData& data);
-  void flushCurrentToDisk() {}
+
 
  protected:
   uint64_t fStart;
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 562242ab7e..b62e6fac1b 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -604,12 +604,43 @@ void TupleAnnexStep::checkAndAllocateMemory4RGData(const rowgroup::RowGroup& row
   }
 }
 
+// RowGroupDL* dl1 = new RowGroupDL(1, jobInfo.fifoSize);
+std::vector<RowGroupDLSPtr> TupleAnnexStep::createInputDLs(const std::vector<std::string>& fileNames)
+{
+  // Creates one RowGroupDL per entry of fileNames; the names themselves are not used yet.
+  std::vector<RowGroupDLSPtr> result;
+  result.reserve(fileNames.size());
+  for (size_t i = 0; i < fileNames.size(); ++i)
+    result.push_back(std::make_shared<RowGroupDL>(1, 16));  // WIP hardcode; make_shared never exposes a raw owner
+  return result;
+}
+
+std::vector<uint64_t> TupleAnnexStep::startReaders(std::vector<RowGroupDLSPtr>& dataLists)
+{
+  // Submits one reader job per datalist to jobstepThreadPool and returns what invoke() gave back
+  // for each of them, in datalist order. The job body is still a WIP placeholder.
+  std::vector<uint64_t> handles;
+  handles.reserve(dataLists.size());
+  for (size_t idx = 0; idx < dataLists.size(); ++idx)
+  {
+    auto readerJob = [&dataLists, idx]()
+    {
+      // Planned: open file; loop { read ByteStream, make BS into RGData,
+      // put into dataLists[idx] }; close file; emit empty RGData.
+    };
+    handles.push_back(jobstepThreadPool.invoke(readerJob));
+  }
+  return handles;
+}
+
+
 void TupleAnnexStep::executeWithOrderBy()
 {
   utils::setThreadName("TNSwOrd");
   RGData rgDataIn;
   RGData rgDataOut;
   bool more = false;
+  bool flushToDisk = false;
 
   try
   {
@@ -632,7 +663,7 @@ void TupleAnnexStep::executeWithOrderBy()
 
           for (uint64_t i = 0; i < fRowGroupIn.getRowCount() && !cancelled(); ++i)
           {
-            fOrderBy->processRow_(fRowIn);
+            fOrderBy->processRow(fRowIn);
             fRowIn.nextRow();
           }
 
@@ -641,8 +672,7 @@ void TupleAnnexStep::executeWithOrderBy()
       }
       catch (const logging::OutOfMemoryExcept&)
       {
-        enableFlushToDisk();
-        incrementGenerationCounter();
+        flushToDisk = true;
       }
       catch (...)
       {
@@ -650,10 +680,11 @@ void TupleAnnexStep::executeWithOrderBy()
                         "TupleAnnexStep::executeWithOrderBy()");
       }
 
-      if (isFlushToDiskEnabled())
+      if (flushToDisk)
       {
-        fOrderBy->flushCurrentToDisk();
-        disableFlushToDisk();
+        bool firstFlush = true;
+        fOrderBy->flushCurrentToDisk(firstFlush);
+        flushToDisk = false;
       }
       else
       {
@@ -669,62 +700,69 @@ void TupleAnnexStep::executeWithOrderBy()
                     logging::ERR_ALWAYS_CRITICAL, "TupleAnnexStep::executeWithOrderBy()");
   }
 
+  // can be disk-based with no or few files and some in-memory state
+
   // store avg RGData size
-  if (getGenerationCounter())
+  if (fOrderBy->isDiskBased())
   {
     // assess RAM available, avg RGData size statistics and free enough memory
     // return memory if needed
-    // create outputDLs or simplier atomic queues + readers threads
-    // call fOrderBy->diskBasedMergePhase()
+    size_t inputQueuesNumber = 2;
+    while (inputQueuesNumber < fOrderBy->getGenerationFilesNumber())
+    {
+      auto fileNames = fOrderBy->getGenerationFileNamesNextBatch(inputQueuesNumber);
+      auto inputDLs = createInputDLs(fileNames);
+      auto readers = startReaders(inputDLs);
+      // create outputDLs or simplier atomic queues + readers threads
+      fOrderBy->diskBasedMergePhaseIfNeeded(inputDLs);
+      jobstepThreadPool.join(readers);
+    }
   }
 
-  // if (!isDiskBased())
-  {
-    fOrderBy->brandNewFinalize();
+  fOrderBy->brandNewFinalize();
 
-    if (!cancelled())
+  if (!cancelled())
+  {
+    while (fOrderBy->getData(rgDataIn))
     {
-      while (fOrderBy->getNextRGData(rgDataIn))
+      if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
       {
-        if (fConstant == NULL && fRowGroupOut.getColumnCount() == fRowGroupIn.getColumnCount())
-        {
-          rgDataOut = rgDataIn;
-          fRowGroupOut.setData(&rgDataOut);
-        }
-        else  // TODO push this into finalize to populate next RGData rows
-        {
-          fRowGroupIn.setData(&rgDataIn);
-          fRowGroupIn.getRow(0, &fRowIn);
+        rgDataOut = rgDataIn;
+        fRowGroupOut.setData(&rgDataOut);
+      }
+      else  // TODO push this into finalize to populate next RGData rows
+      {
+        fRowGroupIn.setData(&rgDataIn);
+        fRowGroupIn.getRow(0, &fRowIn);
 
-          rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
-          fRowGroupOut.setData(&rgDataOut);
-          fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
-          fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
-          fRowGroupOut.getRow(0, &fRowOut);
+        rgDataOut.reinit(fRowGroupOut, fRowGroupIn.getRowCount());
+        fRowGroupOut.setData(&rgDataOut);
+        fRowGroupOut.resetRowGroup(fRowGroupIn.getBaseRid());
+        fRowGroupOut.setDBRoot(fRowGroupIn.getDBRoot());
+        fRowGroupOut.getRow(0, &fRowOut);
 
-          for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
-          {
-            if (fConstant)
-              fConstant->fillInConstants(fRowIn, fRowOut);
-            else
-              copyRow(fRowIn, &fRowOut);
+        for (uint64_t i = 0; i < fRowGroupIn.getRowCount(); ++i)
+        {
+          if (fConstant)
+            fConstant->fillInConstants(fRowIn, fRowOut);
+          else
+            copyRow(fRowIn, &fRowOut);
 
-            fRowGroupOut.incRowCount();
-            fRowOut.nextRow();
-            fRowIn.nextRow();
-          }
+          fRowGroupOut.incRowCount();
+          fRowOut.nextRow();
+          fRowIn.nextRow();
         }
+      }
 
-        if (fRowGroupOut.getRowCount() > 0)
-        {
-          fRowsReturned += fRowGroupOut.getRowCount();
-          fOutputDL->insert(rgDataOut);
+      if (fRowGroupOut.getRowCount() > 0)
+      {
+        fRowsReturned += fRowGroupOut.getRowCount();
+        fOutputDL->insert(rgDataOut);
 
-          // release RGData memory
-          // TODO add some batching here to reduce atomic overhead.
-          // size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
-          // fOrderBy->returnRGDataMemory2RM(rgDataSize);
-        }
+        // release RGData memory
+        // TODO add some batching here to reduce atomic overhead.
+        // size_t rgDataSize = fRowGroupOut.getSizeWithStrings() - fRowGroupOut.getHeaderSize();
+        // fOrderBy->returnRGDataMemory2RM(rgDataSize);
       }
     }
   }
diff --git a/dbcon/joblist/tupleannexstep.h b/dbcon/joblist/tupleannexstep.h
index be3c003c14..0776823e94 100644
--- a/dbcon/joblist/tupleannexstep.h
+++ b/dbcon/joblist/tupleannexstep.h
@@ -22,6 +22,7 @@
 
 #include <boost/thread/thread.hpp>
 #include <atomic>
+#include <memory>
 
 #include "jobstep.h"
 #include "limitedorderby.h"
@@ -114,26 +115,30 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   void printCalTrace();
   void finalizeParallelOrderBy();
   void finalizeParallelOrderByDistinct();
-  void enableFlushToDisk()
-  {
-    fFlushToDisk.store(true, std::memory_order_relaxed);
-  }
-  void disableFlushToDisk()
-  {
-    fFlushToDisk.store(false, std::memory_order_relaxed);
-  }
-  bool isFlushToDiskEnabled() const
-  {
-    return fFlushToDisk.load(std::memory_order_relaxed);
-  }
-  void incrementGenerationCounter()
-  {
-    ++fGenerationCounter;
-  }
-  uint64_t getGenerationCounter() const
-  {
-    return fGenerationCounter;
-  }
+
+  // void enableFlushToDisk()
+  // {
+  //   fFlushToDisk.store(true, std::memory_order_relaxed);
+  // }
+  // void disableFlushToDisk()
+  // {
+  //   fFlushToDisk.store(false, std::memory_order_relaxed);
+  // }
+  // bool isFlushToDiskEnabled() const
+  // {
+  //   return fFlushToDisk.load(std::memory_order_relaxed);
+  // }
+  // void incrementGenerationCounter()
+  // {
+  //   ++fGenerationCounter;
+  // }
+  // uint64_t getGenerationCounter() const
+  // {
+  //   return fGenerationCounter;
+  // }
+
+  std::vector<RowGroupDLSPtr> createInputDLs(const std::vector<std::string>& fileNames);
+  std::vector<uint64_t> startReaders(std::vector<RowGroupDLSPtr>& inputDLs);
 
   // input/output rowgroup and row
   rowgroup::RowGroup fRowGroupIn;
@@ -193,8 +198,8 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   uint16_t fFinishedThreads;
   boost::mutex fParallelFinalizeMutex;
   joblist::ResourceManager* fRm;
-  std::atomic<bool> fFlushToDisk {false};
-  uint64_t fGenerationCounter {0};
+  // std::atomic<bool> fFlushToDisk {false};
+  // uint64_t fGenerationCounter {0};
 };
 
 }  // namespace joblist
diff --git a/tests/minmaxheap.cpp b/tests/minmaxheap.cpp
index 2c28ba204d..9a4f26b210 100644
--- a/tests/minmaxheap.cpp
+++ b/tests/minmaxheap.cpp
@@ -116,7 +116,7 @@ TEST(MinMaxHeap, CustomComparator) {
 // ---------- Robustness & stress ---------------------------------------------
 
 TEST(MinMaxHeap, RandomAgainstMultiset) {
-    constexpr int N = 10000;
+    constexpr int N = 100;
     std::mt19937 rng(42);
     std::uniform_int_distribution<int> dist(-100000, 100000);
 

From 9c3b82651d2503f0d3547f69439772361783afba Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 2 May 2025 15:12:55 +0000
Subject: [PATCH 08/14] fix(): compilation issues

---
 dbcon/joblist/elementtype.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dbcon/joblist/elementtype.h b/dbcon/joblist/elementtype.h
index 1b35e67f93..8b8db7b5f7 100644
--- a/dbcon/joblist/elementtype.h
+++ b/dbcon/joblist/elementtype.h
@@ -418,5 +418,3 @@ extern std::ostream& showOidInDL(std::ostream& strm);
 extern std::ostream& omitOidInDL(std::ostream& strm);
 
 }  // namespace joblist
-
-#endif

From 8258716a4e23c8de68c48bc774d675a8038f7ee4 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 2 May 2025 19:46:15 +0000
Subject: [PATCH 09/14] feat(RG): generalise Dumper class to re-use it for OB.

---
 dbcon/joblist/disk-based-topnorderby.h |   7 +-
 utils/idbdatafile/utility.h            |   2 +
 utils/rowgroup/dumper.cpp              | 219 +++++++++++++++++
 utils/rowgroup/dumper.h                |  65 +++++
 utils/rowgroup/memmanager.h            | 174 +++++++++++++
 utils/rowgroup/rowstorage.cpp          | 328 ++-----------------------
 utils/rowgroup/rowstorage.h            |  10 +-
 7 files changed, 494 insertions(+), 311 deletions(-)
 create mode 100644 utils/rowgroup/dumper.cpp
 create mode 100644 utils/rowgroup/dumper.h
 create mode 100644 utils/rowgroup/memmanager.h

diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
index e1e17e818c..74ede381e6 100644
--- a/dbcon/joblist/disk-based-topnorderby.h
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -28,10 +28,11 @@ namespace joblist
 class DiskBasedTopNOrderBy
 {
  public:
-  DiskBasedTopNOrderBy();
-  ~DiskBasedTopNOrderBy(
+  DiskBasedTopNOrderBy() {}
+  ~DiskBasedTopNOrderBy() 
+  {
      // clean up left over files
-  );
+  }
 
 
   void incrementGenerationCounter()
diff --git a/utils/idbdatafile/utility.h b/utils/idbdatafile/utility.h
index 18dae9005d..663337f531 100644
--- a/utils/idbdatafile/utility.h
+++ b/utils/idbdatafile/utility.h
@@ -22,6 +22,8 @@
 #include <fcntl.h>
 #include <string>
 
+#include <string.h>
+
 namespace idbdatafile
 {
 /**
diff --git a/utils/rowgroup/dumper.cpp b/utils/rowgroup/dumper.cpp
new file mode 100644
index 0000000000..b85e4d560d
--- /dev/null
+++ b/utils/rowgroup/dumper.cpp
@@ -0,0 +1,219 @@
+/* Copyright (C) 2025 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <cassert>
+#include <errno.h>
+
+#include "dumper.h"
+
+namespace common
+{
+  std::string errorString(int errNo)
+  {
+    // Relies on the char*-returning strerror_r; the result may or may not point into `scratch`.
+    char scratch[1024];
+    return std::string(strerror_r(errNo, scratch, sizeof(scratch)));
+  }
+}  // namespace common
+
+namespace rowgroup
+{
+Dumper::Dumper(const compress::CompressInterface* comp, MemManager* mm) : fCompressor(comp), fMM(mm->clone())
+{  // NOTE: mm is dereferenced in the initializer list, so it must not be null; fMM owns the clone
+}
+
+int Dumper::write(const std::string& fname, const char* buf, size_t sz)
+{
+  // Dumps `sz` bytes of `buf` into `fname` (created or truncated), compressed when a
+  // compressor is set. Returns 0 on success or errno; an empty payload creates no file.
+  if (sz == 0)
+    return 0;
+
+  // Compress before opening: checkBuffer() may throw and must not leak a descriptor.
+  const char* tmpbuf = buf;
+  if (fCompressor)
+  {
+    auto len = fCompressor->maxCompressedSize(sz);
+    checkBuffer(len);
+    fCompressor->compress(buf, sz, fTmpBuf.data(), &len);
+    tmpbuf = fTmpBuf.data();
+    sz = len;
+  }
+
+  const int fd = open(fname.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+  if (UNLIKELY(fd < 0))
+    return errno;
+
+  // ::write() may take fewer bytes than asked for, so loop until everything is out.
+  auto to_write = sz;
+  while (to_write > 0)
+  {
+    auto r = ::write(fd, tmpbuf + sz - to_write, to_write);
+    if (UNLIKELY(r < 0))
+    {
+      // Transient conditions: nothing was written, retry the same chunk.
+      if (errno == EAGAIN || errno == EINTR)
+        continue;
+
+      const int ret = errno;
+      close(fd);
+      return ret;
+    }
+    assert(size_t(r) <= to_write);
+    to_write -= r;
+  }
+
+  close(fd);
+  return 0;
+}
+
+int Dumper::read(const std::string& fname, std::vector<char>& buf)
+{
+  // Reads the whole file `fname` into `buf`, uncompressing when a compressor is set.
+  // Returns 0 on success or an errno-style code on failure.
+  const int fd = open(fname.c_str(), O_RDONLY);
+  if (UNLIKELY(fd < 0))
+    return errno;
+
+  // Every exit path below closes the descriptor exactly once through this helper.
+  auto finish = [fd](int err)
+  {
+    close(fd);
+    return err;
+  };
+
+  struct stat st{};
+  if (UNLIKELY(fstat(fd, &st) != 0))
+    return finish(errno);
+
+  const size_t sz = st.st_size;
+  // With compression the raw bytes land in the scratch buffer first.
+  std::vector<char>* tmpbuf = fCompressor ? &fTmpBuf : &buf;
+  if (fCompressor)
+    checkBuffer(sz);
+  else
+    buf.resize(sz);
+
+  auto to_read = sz;
+  while (to_read > 0)
+  {
+    auto r = ::read(fd, tmpbuf->data() + sz - to_read, to_read);
+    if (UNLIKELY(r < 0))
+    {
+      if (errno == EAGAIN || errno == EINTR)
+        continue;
+      return finish(errno);
+    }
+    // EOF before st_size bytes arrived (file shrank): fail instead of looping forever.
+    if (UNLIKELY(r == 0))
+      return finish(EIO);
+
+    assert(size_t(r) <= to_read);
+    to_read -= r;
+  }
+
+  if (fCompressor)
+  {
+    size_t len;
+    if (!fCompressor->getUncompressedSize(tmpbuf->data(), sz, &len))
+      return finish(EPROTO);
+
+    buf.resize(len);
+    fCompressor->uncompress(tmpbuf->data(), sz, buf.data(), &len);
+  }
+
+  return finish(0);
+}
+
+size_t Dumper::size() const
+{
+  return fTmpBuf.size();  // bytes currently held by the scratch buffer, not the size of any file
+}
+
+void Dumper::checkBuffer(size_t len)
+{
+  // Grow-only scratch buffer; old contents are dropped, not copied, when it grows.
+  if (len <= fTmpBuf.size())
+    return;
+  // New capacity is `len` rounded up to a multiple of 8192 bytes.
+  const size_t grownSize = (len + 8191) / 8192 * 8192;
+  std::vector<char> replacement(grownSize);
+  fMM->acquire(grownSize - fTmpBuf.size());  // WIP needs OOM check
+  fTmpBuf.swap(replacement);
+}
+
+std::string RGDumper::makeRGFilename(uint64_t rgid, const uint16_t generation) const
+{
+  // Layout: <tmpdir>/Sorting-p<pid>-t<uniqId>-rg<rgid>-g<generation>. std::to_string converts each
+  // field by its own type, so there are no printf length/sign specifiers to mismatch or truncate.
+  return fTmpDir + "/Sorting-p" + std::to_string(getpid()) + "-t" + std::to_string(fUniqId) + "-rg" +
+         std::to_string(rgid) + "-g" + std::to_string(generation);
+}
+
+void RGDumper::loadRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut,
+                      std::unique_ptr<RGData>& rgdata, bool unlinkDump)
+{
+  auto fname = makeRGFilename(rgid, generation);  // file written by saveRG() for the same (rgid, generation)
+
+  std::vector<char> data;
+  int errNo;
+  if ((errNo = read(fname, data)) != 0)
+  {
+    unlink(fname.c_str());  // NOTE: the file is removed on a read failure regardless of unlinkDump
+    // WIP replace errorcodes
+    throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR,
+                                                                         common::errorString(errNo)),
+                             logging::ERR_DISKAGG_FILEIO_ERROR);
+  }
+
+  messageqcpp::ByteStream bs(reinterpret_cast<uint8_t*>(data.data()), data.size());
+
+  if (unlinkDump)  // the bytes are already in memory, so the file can go before deserialization
+    unlink(fname.c_str());
+  rgdata.reset(new RGData());  // replaces whatever the caller's pointer held
+  rgdata->deserialize(bs, rowgroup::rgCommonSize);
+  assert(bs.length() == 0);  // the whole stream is expected to be consumed
+
+  fRowGroupOut.setData(rgdata.get());
+  auto memSz = fRowGroupOut.getSizeWithStrings();
+
+  if (!fMM->acquire(memSz))  // accounted only after the data has been loaded
+  {
+    // WIP replace errorcodes
+    throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_AGGREGATION_TOO_BIG),
+                             logging::ERR_AGGREGATION_TOO_BIG);
+  }
+}
+
+void RGDumper::saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, RGData* rgdata)
+{
+  // Points fRowGroupOut at rgdata, serializes it and writes the bytes to the file named by
+  // makeRGFilename(rgid, generation); throws IDBExcept(ERR_DISKAGG_FILEIO_ERROR) if write() fails.
+  fRowGroupOut.setData(rgdata);
+  messageqcpp::ByteStream bs;
+  rgdata->serialize(bs, fRowGroupOut.getDataSize());
+  const int errNo = write(makeRGFilename(rgid, generation), (char*)bs.buf(), bs.length());
+  if (errNo == 0)
+    return;
+  throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR,
+                                                                       common::errorString(errNo)),
+                           logging::ERR_DISKAGG_FILEIO_ERROR);
+}
+
+}  // namespace rowgroup
diff --git a/utils/rowgroup/dumper.h b/utils/rowgroup/dumper.h
new file mode 100644
index 0000000000..7695d14856
--- /dev/null
+++ b/utils/rowgroup/dumper.h
@@ -0,0 +1,65 @@
+/* Copyright (C) 2025 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include "idbcompress.h"
+#include "memmanager.h"
+#include "rowgroup.h"
+
+namespace common
+{
+  std::string errorString(int errNo);
+}
+
+namespace rowgroup
+{
+
+class Dumper  // reads/writes whole files, passing the bytes through fCompressor when one is set
+{
+ public:
+  Dumper(const compress::CompressInterface* comp, MemManager* mm);  // keeps comp, stores a clone of mm
+
+  int write(const std::string& fname, const char* buf, size_t sz);  // returns 0 on success, an errno value otherwise
+  int read(const std::string& fname, std::vector<char>& buf);  // returns 0 on success, an errno-style code otherwise
+  size_t size() const;  // current size of fTmpBuf
+
+ private:
+  void checkBuffer(size_t len);  // grows fTmpBuf so it holds at least len bytes
+
+ protected:
+  const compress::CompressInterface* fCompressor;  // null means the data is stored uncompressed
+  std::unique_ptr<MemManager> fMM;  // clone of the manager passed to the constructor
+  std::vector<char> fTmpBuf;  // scratch buffer holding the compressed form of the data
+};
+
+class RGDumper: protected Dumper  // Dumper that stores RGData in files keyed by (rgid, generation)
+{
+ public:
+  RGDumper(const compress::CompressInterface* comp, MemManager* mm, const std::string& tmpDir, const uint64_t uniqId) : Dumper(comp, mm), fTmpDir(tmpDir), fUniqId(uniqId) { }
+  void loadRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, std::unique_ptr<RGData>& rgdata, bool unlinkDump = false);
+  void saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, RGData* rgdata);
+  // Builds the dump file path for (rgid, generation); the same name is used by loadRG and saveRG.
+  std::string makeRGFilename(uint64_t rgid, const uint16_t generation) const;
+ private:
+  std::string fTmpDir;  // directory the dump files are created in
+  uint64_t fUniqId;  // part of every file name, so it has to be initialized by the constructor
+};
+
+}
\ No newline at end of file
diff --git a/utils/rowgroup/memmanager.h b/utils/rowgroup/memmanager.h
new file mode 100644
index 0000000000..ebd9809e36
--- /dev/null
+++ b/utils/rowgroup/memmanager.h
@@ -0,0 +1,174 @@
+/* Copyright (C) 2021-2025 MariaDB Corporation
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#pragma once
+
+#include <cstdint>
+#include <bits/types.h>
+
+#include "resourcemanager.h"
+
+namespace rowgroup
+{
+/** @brief Some service wrapping around ResourceManager (or NoOP) */
+
+class MemManager  // base accountant: only counts bytes in fMemUsed and never refuses a request
+{
+ public:
+  MemManager()
+  {
+  }
+  virtual ~MemManager()
+  {
+    release(fMemUsed);  // give back whatever is still accounted
+  }
+
+  bool acquire(std::size_t amount)  // returns what acquireImpl() returns; the base version always succeeds
+  {
+    return acquireImpl(amount);
+  }
+  void release(ssize_t amount = 0)  // 0 (the default) or more than fMemUsed means "release everything"
+  {
+    // in some cases it tries to release more memory than acquired, ie create
+    // new rowgroup, acquire maximum size (w/o strings), add some rows with
+    // strings and finally release the actual size of RG with strings
+    if (amount == 0 || amount > fMemUsed)
+      amount = fMemUsed;
+    releaseImpl(amount);
+  }
+
+  ssize_t getUsed() const  // bytes currently accounted by this manager
+  {
+    return fMemUsed;
+  }
+  virtual int64_t getFree() const  // the base class reports no limit
+  {
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  virtual int64_t getConfigured() const  // the base class reports no limit
+  {
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  virtual bool isStrict() const
+  {
+    return false;
+  }
+
+  virtual MemManager* clone() const  // caller owns the result; fMemUsed is not copied
+  {
+    return new MemManager();
+  }
+
+  virtual joblist::ResourceManager* getResourceManaged()  // the base class has no ResourceManager
+  {
+    return nullptr;
+  }
+  virtual boost::shared_ptr<int64_t> getSessionLimit()  // the base class has no session limit
+  {
+    return {};
+  }
+
+ protected:
+  virtual bool acquireImpl(std::size_t amount)  // bookkeeping only
+  {
+    fMemUsed += amount;
+    return true;
+  }
+  virtual void releaseImpl(std::size_t amount)  // bookkeeping only
+  {
+    fMemUsed -= amount;
+  }
+  ssize_t fMemUsed = 0;
+};
+
+class RMMemManager : public MemManager  // charges and returns memory through joblist::ResourceManager
+{
+ public:
+  RMMemManager(joblist::ResourceManager* rm, boost::shared_ptr<int64_t> sl, bool wait = true,
+               bool strict = true)
+   : fRm(rm), fSessLimit(std::move(sl)), fWait(wait), fStrict(strict)
+  {
+  }
+
+  ~RMMemManager() override
+  {
+    release(fMemUsed);
+    fMemUsed = 0;  // leaves nothing for the base-class destructor to release again
+  }
+
+  int64_t getConfigured() const final
+  {
+    return fRm->getConfiguredUMMemLimit();
+  }
+
+  int64_t getFree() const final  // the smaller of fRm's available memory and the session limit
+  {
+    return std::min(fRm->availableMemory(), *fSessLimit);
+  }
+
+  bool isStrict() const final
+  {
+    return fStrict;
+  }
+
+  MemManager* clone() const final  // same rm/limit/flags, but the clone starts with fMemUsed == 0
+  {
+    return new RMMemManager(fRm, fSessLimit, fWait, fStrict);
+  }
+
+  joblist::ResourceManager* getResourceManaged() override
+  {
+    return fRm;
+  }
+  boost::shared_ptr<int64_t> getSessionLimit() override
+  {
+    return fSessLimit;
+  }
+
+ protected:
+  bool acquireImpl(size_t amount) final
+  {
+    if (amount)
+    {
+      if (!fRm->getMemory(amount, fSessLimit, fWait) && fStrict)
+      {
+        return false;  // strict mode: a refused request is reported and not counted
+      }
+      MemManager::acquireImpl(amount);  // non-strict mode gets here even when getMemory() refused
+    }
+    return true;
+  }
+
+  void releaseImpl(size_t amount) override
+  {
+    if (amount)
+    {
+      MemManager::releaseImpl(amount);
+      fRm->returnMemory(amount, fSessLimit);
+    }
+  }
+
+ private:
+  joblist::ResourceManager* fRm = nullptr;
+  boost::shared_ptr<int64_t> fSessLimit;
+  const bool fWait;  // forwarded as the third argument of ResourceManager::getMemory()
+  const bool fStrict;  // when false, acquireImpl() never reports failure
+};
+
+}
\ No newline at end of file
diff --git a/utils/rowgroup/rowstorage.cpp b/utils/rowgroup/rowstorage.cpp
index 8ab629873b..9e4278f93e 100644
--- a/utils/rowgroup/rowstorage.cpp
+++ b/utils/rowgroup/rowstorage.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2021-2022 MariaDB Corporation
+/* Copyright (C) 2021 MariaDB Corporation
 
    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
@@ -23,6 +23,9 @@
 #include "rowgroup.h"
 #include <resourcemanager.h>
 #include <fcntl.h>
+
+#include "dumper.h"
+#include "memmanager.h"
 #include "rowstorage.h"
 #include "robin_hood.h"
 
@@ -77,12 +80,12 @@ int readData(int fd, char* buf, size_t sz)
   return 0;
 }
 
-std::string errorString(int errNo)
-{
-  char tmp[1024];
-  auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
-  return {buf};
-}
+// std::string common::errorString(int errNo)
+// {
+//   char tmp[1024];
+//   auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
+//   return {buf};
+// }
 
 size_t findFirstSetBit(const uint64_t mask)
 {
@@ -275,289 +278,6 @@ struct LRU : public LRUIface
   List fList;
 };
 
-/** @brief Some service wrapping around ResourceManager (or NoOP) */
-class MemManager
-{
- public:
-  MemManager()
-  {
-  }
-  virtual ~MemManager()
-  {
-    release(fMemUsed);
-  }
-
-  bool acquire(ssize_t amount)
-  {
-    if (UNLIKELY(-amount > fMemUsed))
-      amount = -fMemUsed;
-    return acquireImpl(amount);
-  }
-  void release(ssize_t amount = 0)
-  {
-    // in some cases it tries to release more memory than acquired, ie create
-    // new rowgroup, acquire maximum size (w/o strings), add some rows with
-    // strings and finally release the actual size of RG with strings
-    if (amount == 0 || amount > fMemUsed)
-      amount = fMemUsed;
-    releaseImpl(amount);
-  }
-
-  ssize_t getUsed() const
-  {
-    return fMemUsed;
-  }
-  virtual int64_t getFree() const
-  {
-    return std::numeric_limits<int64_t>::max();
-  }
-
-  virtual int64_t getConfigured() const
-  {
-    return std::numeric_limits<int64_t>::max();
-  }
-
-  virtual bool isStrict() const
-  {
-    return false;
-  }
-
-  virtual MemManager* clone() const
-  {
-    return new MemManager();
-  }
-
-  virtual joblist::ResourceManager* getResourceManaged()
-  {
-    return nullptr;
-  }
-  virtual boost::shared_ptr<int64_t> getSessionLimit()
-  {
-    return {};
-  }
-
- protected:
-  virtual bool acquireImpl(ssize_t amount)
-  {
-    fMemUsed += amount;
-    return true;
-  }
-  virtual void releaseImpl(ssize_t amount)
-  {
-    fMemUsed -= amount;
-  }
-  ssize_t fMemUsed{0};
-};
-
-class RMMemManager : public MemManager
-{
- public:
-  RMMemManager(joblist::ResourceManager* rm, boost::shared_ptr<int64_t> sl, bool wait = true,
-               bool strict = true)
-   : fRm(rm), fSessLimit(std::move(sl)), fWait(wait), fStrict(strict)
-  {
-  }
-
-  ~RMMemManager() override
-  {
-    release(fMemUsed);
-    fMemUsed = 0;
-  }
-
-  int64_t getConfigured() const final
-  {
-    return fRm->getConfiguredUMMemLimit();
-  }
-
-  int64_t getFree() const final
-  {
-    return std::min(fRm->availableMemory(), *fSessLimit);
-  }
-
-  bool isStrict() const final
-  {
-    return fStrict;
-  }
-
-  MemManager* clone() const final
-  {
-    return new RMMemManager(fRm, fSessLimit, fWait, fStrict);
-  }
-
-  joblist::ResourceManager* getResourceManaged() override
-  {
-    return fRm;
-  }
-  boost::shared_ptr<int64_t> getSessionLimit() override
-  {
-    return fSessLimit;
-  }
-
- protected:
-  bool acquireImpl(ssize_t amount) final
-  {
-    if (amount)
-    {
-      if (!fRm->getMemory(amount, fSessLimit, fWait) && fStrict)
-      {
-        return false;
-      }
-      MemManager::acquireImpl(amount);
-    }
-    return true;
-  }
-
-  void releaseImpl(ssize_t amount) override
-  {
-    if (amount)
-    {
-      MemManager::releaseImpl(amount);
-      fRm->returnMemory(amount, fSessLimit);
-    }
-  }
-
- private:
-  joblist::ResourceManager* fRm = nullptr;
-  boost::shared_ptr<int64_t> fSessLimit;
-  const bool fWait;
-  const bool fStrict;
-};
-
-class Dumper
-{
- public:
-  Dumper(const compress::CompressInterface* comp, MemManager* mm) : fCompressor(comp), fMM(mm->clone())
-  {
-  }
-
-  int write(const std::string& fname, const char* buf, size_t sz)
-  {
-    if (sz == 0)
-      return 0;
-
-    int fd = open(fname.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
-    if (UNLIKELY(fd < 0))
-      return errno;
-
-    const char* tmpbuf;
-    if (fCompressor)
-    {
-      auto len = fCompressor->maxCompressedSize(sz);
-      checkBuffer(len);
-      fCompressor->compress(buf, sz, fTmpBuf.data(), &len);
-      tmpbuf = fTmpBuf.data();
-      sz = len;
-    }
-    else
-    {
-      tmpbuf = buf;
-    }
-
-    auto to_write = sz;
-    int ret = 0;
-    while (to_write > 0)
-    {
-      auto r = ::write(fd, tmpbuf + sz - to_write, to_write);
-      if (UNLIKELY(r < 0))
-      {
-        if (errno == EAGAIN)
-          continue;
-
-        ret = errno;
-        close(fd);
-        return ret;
-      }
-      assert(size_t(r) <= to_write);
-      to_write -= r;
-    }
-
-    close(fd);
-    return ret;
-  }
-
-  int read(const std::string& fname, std::vector<char>& buf)
-  {
-    int fd = open(fname.c_str(), O_RDONLY);
-    if (UNLIKELY(fd < 0))
-      return errno;
-
-    struct stat st
-    {
-    };
-    fstat(fd, &st);
-    size_t sz = st.st_size;
-    std::vector<char>* tmpbuf;
-    if (fCompressor)
-    {
-      tmpbuf = &fTmpBuf;
-      checkBuffer(sz);
-    }
-    else
-    {
-      tmpbuf = &buf;
-      buf.resize(sz);
-    }
-
-    auto to_read = sz;
-    int ret = 0;
-    while (to_read > 0)
-    {
-      auto r = ::read(fd, tmpbuf->data() + sz - to_read, to_read);
-      if (UNLIKELY(r < 0))
-      {
-        if (errno == EAGAIN)
-          continue;
-
-        ret = errno;
-        close(fd);
-        return ret;
-      }
-
-      assert(size_t(r) <= to_read);
-      to_read -= r;
-    }
-
-    if (fCompressor)
-    {
-      size_t len;
-      if (!fCompressor->getUncompressedSize(tmpbuf->data(), sz, &len))
-      {
-        ret = EPROTO;
-        close(fd);
-        return ret;
-      }
-
-      buf.resize(len);
-      fCompressor->uncompress(tmpbuf->data(), sz, buf.data(), &len);
-    }
-
-    close(fd);
-    return ret;
-  }
-
-  size_t size() const
-  {
-    return fTmpBuf.size();
-  }
-
- private:
-  void checkBuffer(size_t len)
-  {
-    if (fTmpBuf.size() < len)
-    {
-      size_t newtmpsz = (len + 8191) / 8192 * 8192;
-      std::vector<char> tmpvec(newtmpsz);
-      fMM->acquire(newtmpsz - fTmpBuf.size());
-      fTmpBuf.swap(tmpvec);
-    }
-  }
-
- private:
-  const compress::CompressInterface* fCompressor;
-  std::unique_ptr<MemManager> fMM;
-  std::vector<char> fTmpBuf;
-};
-
 /** @brief Storage for RGData with LRU-cache & memory management
  */
 class RowGroupStorage
@@ -718,7 +438,7 @@ class RowGroupStorage
         if (UNLIKELY(r < 0))
         {
           throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(
-                                       logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errno)),
+                                       logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errno)),
                                    logging::ERR_DISKAGG_FILEIO_ERROR);
         }
       }
@@ -1117,7 +837,7 @@ class RowGroupStorage
     if (UNLIKELY(fd < 0))
     {
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errno)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errno)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
     uint64_t sz = fRGDatas.size();
@@ -1131,7 +851,7 @@ class RowGroupStorage
       close(fd);
       unlink(fname.c_str());
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
     close(fd);
@@ -1145,7 +865,7 @@ class RowGroupStorage
     if (fd < 0)
     {
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errno)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errno)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
     uint64_t sz;
@@ -1157,7 +877,7 @@ class RowGroupStorage
       close(fd);
       unlink(fname.c_str());
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
     fRGDatas.resize(sz);
@@ -1167,7 +887,7 @@ class RowGroupStorage
       close(fd);
       unlink(fname.c_str());
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
     close(fd);
@@ -1361,7 +1081,7 @@ class RowGroupStorage
     {
       unlink(fname.c_str());
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
 
@@ -1415,7 +1135,7 @@ class RowGroupStorage
     if ((errNo = fDumper->write(makeRGFilename(rgid), (char*)bs.buf(), bs.length())) != 0)
     {
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
   }
@@ -1604,7 +1324,7 @@ class RowPosHashStorage
     if ((errNo = fDumper->write(makeDumpName(), (char*)fPosHashes.data(), sz)) != 0)
     {
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
   }
@@ -1638,7 +1358,7 @@ class RowPosHashStorage
     if ((errNo = fDumper->read(makeDumpName(), data)) != 0)
     {
       throw logging::IDBExcept(
-          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+          logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
           logging::ERR_DISKAGG_FILEIO_ERROR);
     }
 
@@ -2233,7 +1953,7 @@ void RowAggStorage::dumpInternalData() const
   if (fd < 0)
   {
     throw logging::IDBExcept(
-        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errno)),
+        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errno)),
         logging::ERR_DISKAGG_FILEIO_ERROR);
   }
 
@@ -2242,7 +1962,7 @@ void RowAggStorage::dumpInternalData() const
   {
     close(fd);
     throw logging::IDBExcept(
-        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
         logging::ERR_DISKAGG_FILEIO_ERROR);
   }
   close(fd);
@@ -2450,7 +2170,7 @@ void RowAggStorage::loadGeneration(uint16_t gen, size_t& size, size_t& mask, siz
   if (fd < 0)
   {
     throw logging::IDBExcept(
-        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errno)),
+        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errno)),
         logging::ERR_DISKAGG_FILEIO_ERROR);
   }
   struct stat st
@@ -2464,7 +2184,7 @@ void RowAggStorage::loadGeneration(uint16_t gen, size_t& size, size_t& mask, siz
   {
     close(fd);
     throw logging::IDBExcept(
-        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, errorString(errNo)),
+        logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR, common::errorString(errNo)),
         logging::ERR_DISKAGG_FILEIO_ERROR);
   }
   close(fd);
diff --git a/utils/rowgroup/rowstorage.h b/utils/rowgroup/rowstorage.h
index 72c20e9327..2211d1bbde 100644
--- a/utils/rowgroup/rowstorage.h
+++ b/utils/rowgroup/rowstorage.h
@@ -17,14 +17,16 @@
 
 #pragma once
 
-#include "resourcemanager.h"
-#include "rowgroup.h"
-#include "idbcompress.h"
 #include <cstdint>
 #include <random>
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "idbcompress.h"
+#include "memmanager.h"
+#include "resourcemanager.h"
+#include "rowgroup.h"
+
 namespace rowgroup
 {
 uint32_t calcNumberOfBuckets(ssize_t availMem, uint32_t numOfThreads, uint32_t numOfBuckets,
@@ -67,7 +69,7 @@ class RowAggStorage
 
   static uint16_t getMaxRows(bool enabledDiskAgg)
   {
-    return (enabledDiskAgg ? 8192 : 256);
+    return (enabledDiskAgg ? rowgroup::rgCommonSize : 256);
   }
 
   static size_t getBucketSize();

From 7e3295895cb7d3019c3a3c6ddfe3ed6b83f690b8 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 2 May 2025 22:24:15 +0000
Subject: [PATCH 10/14] feat(dumper): dumper interface improvements and some
 TNS and OB refactoring

---
 dbcon/joblist/disk-based-topnorderby.h | 47 +++++++++++++++++++-------
 dbcon/joblist/jlf_tuplejoblist.cpp     | 11 ++----
 dbcon/joblist/limitedorderby.cpp       | 31 ++++++++---------
 dbcon/joblist/limitedorderby.h         |  3 +-
 dbcon/joblist/tupleannexstep.cpp       |  9 +++--
 dbcon/joblist/tupleannexstep.h         |  6 ++--
 utils/configcpp/configcpp.cpp          |  1 +
 utils/configcpp/configcpp.h            |  3 +-
 utils/rowgroup/dumper.cpp              | 20 +++++------
 utils/rowgroup/dumper.h                | 27 ++++++++++-----
 utils/rowgroup/memmanager.h            |  1 +
 11 files changed, 90 insertions(+), 69 deletions(-)

diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
index 74ede381e6..e89c96455e 100644
--- a/dbcon/joblist/disk-based-topnorderby.h
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -21,19 +21,26 @@
 #include <string>
 #include <vector>
 
+#include "dumper.h"
 #include "elementtype.h"
+#include "resourcemanager.h"
 namespace joblist
 {
 
-class DiskBasedTopNOrderBy
+class DiskBasedTopNOrderBy : public rowgroup::RGDumper
 {
+  // std::string fTmpDir =
+  //     config::Config::makeConfig()->getTempFileDir(config::Config::TempDirPurpose::Aggregates);
+  // std::string fCompStr = config::Config::makeConfig()->getConfig("RowAggregation", "Compression");
  public:
-  DiskBasedTopNOrderBy() {}
-  ~DiskBasedTopNOrderBy() 
+  // TODO Parametrize compression, tmpdir and memory manager (can be temp)
+  DiskBasedTopNOrderBy(ResourceManager* rm)
+   : RGDumper(compress::getCompressInterfaceByName("LZ4"), std::make_unique<rowgroup::MemManager>(),
+              config::Config::makeConfig()->getTempFileDir(config::Config::TempDirPurpose::Sorting),
+              "Sorting", reinterpret_cast<std::uintptr_t>(this))
   {
-     // clean up left over files
   }
-
+  ~DiskBasedTopNOrderBy() = default;
 
   void incrementGenerationCounter()
   {
@@ -44,16 +51,30 @@ class DiskBasedTopNOrderBy
     return fGenerationCounter;
   }
 
-  bool isDiskBased() const { return fGenerationCounter > 0; }
+  bool isDiskBased() const
+  {
+    return fGenerationCounter > 0;
+  }
 
-  size_t getGenerationFilesNumber() const { return 0; }
-  std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize) { return {}; }
+  size_t getGenerationFilesNumber() const
+  {
+    return 0;
+  }
+  std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize)
+  {
+    return {};
+  }
 
-  void flushCurrentToDisk(const bool firstFlush) { incrementGenerationCounter(); }
-  void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists) {}
+  void flushCurrentToDisk(const bool firstFlush)
+  {
+    incrementGenerationCounter();
+  }
+  void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists)
+  {
+  }
 
-//  private:
-  uint64_t fGenerationCounter {0};
+  //  private:
+  uint64_t fGenerationCounter{0};
 };
 
-}
\ No newline at end of file
+}  // namespace joblist
\ No newline at end of file
diff --git a/dbcon/joblist/jlf_tuplejoblist.cpp b/dbcon/joblist/jlf_tuplejoblist.cpp
index 96add786bb..bfe36dac19 100644
--- a/dbcon/joblist/jlf_tuplejoblist.cpp
+++ b/dbcon/joblist/jlf_tuplejoblist.cpp
@@ -499,12 +499,6 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps,
     deliverySteps[CNX_VTABLE_ID] = ws;
   }
 
-  // TODO MCOL-894 we don't need to run sorting|distinct
-  // every time
-  //    if ((jobInfo.limitCount != (uint64_t) - 1) ||
-  //            (jobInfo.constantCol == CONST_COL_EXIST) ||
-  //            (jobInfo.hasDistinct))
-  //    {
   if (jobInfo.annexStep.get() == NULL)
     jobInfo.annexStep.reset(new TupleAnnexStep(jobInfo));
 
@@ -513,20 +507,19 @@ void adjustLastStep(JobStepVector& querySteps, DeliveredTableMap& deliverySteps,
 
   if (jobInfo.orderByColVec.size() > 0)
   {
-    tas->addOrderBy(new LimitedOrderBy());
+    tas->addOrderBy(jobInfo.rm);
     if (jobInfo.orderByThreads > 1)
       tas->setParallelOp();
     tas->setMaxThreads(jobInfo.orderByThreads);
   }
 
+  // TODO decouple TCS from TNS
   if (jobInfo.constantCol == CONST_COL_EXIST)
     tas->addConstant(new TupleConstantStep(jobInfo));
 
   if (jobInfo.hasDistinct)
     tas->setDistinct();
 
-  //    }
-
   if (jobInfo.annexStep)
   {
     TupleDeliveryStep* ds = dynamic_cast<TupleDeliveryStep*>(deliverySteps[CNX_VTABLE_ID].get());
diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index c88598568c..d898c75cf1 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -40,7 +40,8 @@ namespace joblist
 const uint64_t LimitedOrderBy::fMaxUncommited = 102400;  // 100KiB - make it configurable?
 
 // LimitedOrderBy class implementation
-LimitedOrderBy::LimitedOrderBy() : fStart(0), fCount(-1), fUncommitedMemory(0)
+LimitedOrderBy::LimitedOrderBy(ResourceManager* rm)
+ : DiskBasedTopNOrderBy(rm), fStart(0), fCount(-1), fUncommitedMemory(0)
 {
   fRule.fIdbCompare = this;
 }
@@ -109,13 +110,12 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   if (fCount == 0)
     return;
 
-  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl; 
-  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl; 
+  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl;
+  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl;
   auto& orderedRowsQueue = getQueue();
   // if the row count is less than the limit
   if (orderedRowsQueue.size() < fStart + fCount)
   {
-
     copyRow(row, &fRow0);
     OrderByRow newRow(fRow0, fRule);
     orderedRowsQueue.push(newRow);
@@ -163,7 +163,7 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
     OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
     std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
-    std::cout <<"LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
+    std::cout << "LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
 
     copyRow(row, &row1);
 
@@ -178,7 +178,6 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   }
 }
 
-
 void LimitedOrderBy::processRow_(const rowgroup::Row& row)
 {
   // check if this is a distinct row
@@ -189,13 +188,12 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
   if (fCount == 0)
     return;
 
-
   // TODO copy rules or replace ptrs to real instances in CompareRules
   // auto invertedRule = fRule;
   // invertedRule.revertRules();
 
-  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl; 
-  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl; 
+  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl;
+  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl;
   auto& orderedRowsQueue = getQueue();
   // if the row count is less than the limit
   if (orderedRowsQueue.size() < fStart + fCount)
@@ -247,7 +245,7 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
     OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
     std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
-    std::cout <<"LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
+    std::cout << "LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
 
     copyRow(row, &row1);
 
@@ -268,8 +266,8 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
 
 //   // Skip OFFSET
 //   uint64_t sqlOffset = fStart;
-//   std::cout << "brandNewFinalize offset " << sqlOffset << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
-//   while (sqlOffset > 0 && !orderedRowsQueue.empty())
+//   std::cout << "brandNewFinalize offset " << sqlOffset << " orderedRowsQueue.size() " <<
+//   orderedRowsQueue.size() << std::endl; while (sqlOffset > 0 && !orderedRowsQueue.empty())
 //   {
 //     auto r = orderedRowsQueue.top();
 //     row1.setData(r.fData);
@@ -283,7 +281,7 @@ void LimitedOrderBy::brandNewFinalize()
 {
   if (!isDiskBased())
   {
-    return finalize();    
+    return finalize();
   }
 
   // if disk-based
@@ -391,11 +389,9 @@ void LimitedOrderBy::brandNewFinalize()
 
     fDataQueue = tempQueue;
   }
-
-
 }
 
-/* 
+/*
  * The f() copies top element from an ordered queue into a row group. It
  * does this backwards to syncronise sorting orientation with the server.
  * The top row from the queue goes last into the returned set.
@@ -525,7 +521,8 @@ bool LimitedOrderBy::getNextRGData(RGData& data)
   // and the current sorted queue size.
   uint64_t rowsToRetrieve = std::min(fCount - fRowsReturned, fRowsPerRG);
   uint64_t rowsToRetrieveFromQueue = std::min(rowsToRetrieve, orderedRowsQueue.size());
-  std::cout << "getNextRGData rowsToRetrieve " << rowsToRetrieve << " orderedRowsQueue.size() " << orderedRowsQueue.size() << std::endl;
+  std::cout << "getNextRGData rowsToRetrieve " << rowsToRetrieve << " orderedRowsQueue.size() "
+            << orderedRowsQueue.size() << std::endl;
   std::cout << "getNextRGData rowsToRetrieveFromQueue " << rowsToRetrieveFromQueue << std::endl;
   for (; rowsToRetrieveFromQueue > thisRGRowNumber; ++thisRGRowNumber)
   {
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 5a4107e47e..0ed284ce56 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -25,6 +25,7 @@
 
 #include "disk-based-topnorderby.h"
 #include "../../utils/windowfunction/idborderby.h"
+#include "resourcemanager.h"
 #include "rowgroup.h"
 
 namespace joblist
@@ -39,7 +40,7 @@ struct JobInfo;
 class LimitedOrderBy : public ordering::IdbOrderBy, public DiskBasedTopNOrderBy
 {
  public:
-  LimitedOrderBy();
+  LimitedOrderBy(ResourceManager* rm = nullptr); // TODO remove default
   ~LimitedOrderBy() override;
   using ordering::IdbOrderBy::initialize;
   void initialize(const rowgroup::RowGroup&, const JobInfo&, bool invertRules = false,
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index b62e6fac1b..949269340b 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -118,8 +118,7 @@ TupleAnnexStep::TupleAnnexStep(const JobInfo& jobInfo)
  , fEndOfResult(false)
  , fDistinct(false)
  , fParallelOp(false)
- , fOrderBy(NULL)
- , fConstant(NULL)
+ , fConstant(nullptr)
  , fFeInstance(funcexp::FuncExp::instance())
  , fJobList(jobInfo.jobListPtr)
  , fFinishedThreads(0)
@@ -146,10 +145,10 @@ TupleAnnexStep::~TupleAnnexStep()
     fRunnersList.clear();
   }
 
-  if (fOrderBy)
-    delete fOrderBy;
+  // if (fOrderBy)
+  //   delete fOrderBy;
 
-  fOrderBy = NULL;
+  // fOrderBy = NULL;
 
   if (fConstant)
     delete fConstant;
diff --git a/dbcon/joblist/tupleannexstep.h b/dbcon/joblist/tupleannexstep.h
index 0776823e94..5c23398fe5 100644
--- a/dbcon/joblist/tupleannexstep.h
+++ b/dbcon/joblist/tupleannexstep.h
@@ -70,9 +70,9 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
 
   void initialize(const rowgroup::RowGroup& rgIn, const JobInfo& jobInfo);
 
-  void addOrderBy(LimitedOrderBy* lob)
+  void addOrderBy(ResourceManager* rm)
   {
-    fOrderBy = lob;
+    fOrderBy = std::make_unique<LimitedOrderBy>(rm);
   }
   void addConstant(TupleConstantStep* tcs)
   {
@@ -187,7 +187,7 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   bool fDistinct;
   bool fParallelOp;
 
-  LimitedOrderBy* fOrderBy;
+  std::unique_ptr<LimitedOrderBy> fOrderBy{nullptr};
   TupleConstantStep* fConstant;
 
   funcexp::FuncExp* fFeInstance;
diff --git a/utils/configcpp/configcpp.cpp b/utils/configcpp/configcpp.cpp
index 3f76689e99..97ab54fb2f 100644
--- a/utils/configcpp/configcpp.cpp
+++ b/utils/configcpp/configcpp.cpp
@@ -547,6 +547,7 @@ std::string Config::getTempFileDir(Config::TempDirPurpose what)
   {
     case TempDirPurpose::Joins: return prefix.append("joins/");
     case TempDirPurpose::Aggregates: return prefix.append("aggregates/");
+    case TempDirPurpose::Sorting: return prefix.append("sorting/");
   }
   // NOTREACHED
   return {};
diff --git a/utils/configcpp/configcpp.h b/utils/configcpp/configcpp.h
index 86e2db67b4..bcb27b57cb 100644
--- a/utils/configcpp/configcpp.h
+++ b/utils/configcpp/configcpp.h
@@ -196,7 +196,8 @@ class Config
   enum class TempDirPurpose
   {
     Joins,      ///< disk joins
-    Aggregates  ///< disk-based aggregation
+    Aggregates, ///< disk-based aggregation
+    Sorting     ///< disk-based sorting
   };
   /** @brief Return temporaru directory path for the specified purpose */
   std::string getTempFileDir(TempDirPurpose what);
diff --git a/utils/rowgroup/dumper.cpp b/utils/rowgroup/dumper.cpp
index b85e4d560d..6935e8c444 100644
--- a/utils/rowgroup/dumper.cpp
+++ b/utils/rowgroup/dumper.cpp
@@ -24,20 +24,16 @@
 
 namespace common
 {
-  std::string errorString(int errNo)
-  {
-    char tmp[1024];
-    auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
-    return {buf};
-  }
+std::string errorString(int errNo)
+{
+  char tmp[1024];
+  auto* buf = strerror_r(errNo, tmp, sizeof(tmp));
+  return {buf};
+}
 }  // namespace common
 
 namespace rowgroup
 {
-Dumper::Dumper(const compress::CompressInterface* comp, MemManager* mm) : fCompressor(comp), fMM(mm->clone())
-{
-}
-
 int Dumper::write(const std::string& fname, const char* buf, size_t sz)
 {
   if (sz == 0)
@@ -161,8 +157,10 @@ void Dumper::checkBuffer(size_t len)
 std::string RGDumper::makeRGFilename(uint64_t rgid, const uint16_t generation) const
 {
   char buf[PATH_MAX];
-  snprintf(buf, sizeof(buf), "%s/Sorting-p%u-t%ld-rg%lu-g%u", fTmpDir.c_str(), getpid(), fUniqId, rgid,
-           generation);
+  // Builds "<tmpDir>/<operation>-p<pid>-t<uniqId>-rg<rgid>-g<generation>". The directory has to be the
+  // first argument, and fUniqId/rgid are both uint64_t, hence the unsigned %lu conversions.
+  snprintf(buf, sizeof(buf), "%s/%s-p%u-t%lu-rg%lu-g%u", fTmpDir.c_str(), fOperationName.c_str(), getpid(),
+           fUniqId, rgid, generation);
   return buf;
 }
 
diff --git a/utils/rowgroup/dumper.h b/utils/rowgroup/dumper.h
index 7695d14856..b006ed933f 100644
--- a/utils/rowgroup/dumper.h
+++ b/utils/rowgroup/dumper.h
@@ -25,7 +25,7 @@
 
 namespace common
 {
-  std::string errorString(int errNo);
+std::string errorString(int errNo);
 }
 
 namespace rowgroup
@@ -34,8 +34,11 @@ namespace rowgroup
 class Dumper
 {
  public:
-  Dumper(const compress::CompressInterface* comp, MemManager* mm);
-
+  // Borrowing overload: mm stays owned by the caller, so keep a private clone() instead of adopting it.
+  Dumper(const compress::CompressInterface* comp, MemManager* mm) : fCompressor(comp), fMM(mm->clone()) {}
+  Dumper(const compress::CompressInterface* comp, std::unique_ptr<MemManager>& mm)
+   : fCompressor(comp), fMM(std::move(mm)) {}
+  ~Dumper() = default;
   int write(const std::string& fname, const char* buf, size_t sz);
   int read(const std::string& fname, std::vector<char>& buf);
   size_t size() const;
@@ -49,17 +52,25 @@ class Dumper
   std::vector<char> fTmpBuf;
 };
 
-class RGDumper: protected Dumper
+class RGDumper : protected Dumper
 {
  public:
-  RGDumper(const compress::CompressInterface* comp, MemManager* mm, const std::string& tmpDir, const uint64_t uniqId) : Dumper(comp, mm), fTmpDir(tmpDir) { }
-  void loadRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, std::unique_ptr<RGData>& rgdata, bool unlinkDump = false);
+  // uniqId is stored in fUniqId because makeRGFilename() embeds it into every dump file name.
+  RGDumper(const compress::CompressInterface* comp, std::unique_ptr<MemManager> mm, const std::string& tmpDir,
+           const std::string& operationName, const uint64_t uniqId)
+   : Dumper(comp, mm), fTmpDir(tmpDir), fOperationName(operationName), fUniqId(uniqId)
+  {}
+  ~RGDumper() = default;
+  void loadRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut,
+              std::unique_ptr<RGData>& rgdata, bool unlinkDump = false);
   void saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, RGData* rgdata);
 
   std::string makeRGFilename(uint64_t rgid, const uint16_t generation) const;
-private:
+
+ private:
   std::string fTmpDir;
+  std::string fOperationName;
   uint64_t fUniqId;
 };
 
-}
\ No newline at end of file
+}  // namespace rowgroup
\ No newline at end of file
diff --git a/utils/rowgroup/memmanager.h b/utils/rowgroup/memmanager.h
index ebd9809e36..4a392311eb 100644
--- a/utils/rowgroup/memmanager.h
+++ b/utils/rowgroup/memmanager.h
@@ -22,6 +22,7 @@
 
 #include "resourcemanager.h"
 
+// TODO change namespace
 namespace rowgroup
 {
 /** @brief Some service wrapping around ResourceManager (or NoOP) */

From 3176effc73cda4e8eb9ab27917d2b162513a9846 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Thu, 8 May 2025 20:42:51 +0000
Subject: [PATCH 11/14] feat(sorting): basic dumper works for initial runs of
 TopNOrderBy.

---
 dbcon/joblist/CMakeLists.txt             |   1 +
 dbcon/joblist/disk-based-topnorderby.cpp |  58 ++++++
 dbcon/joblist/disk-based-topnorderby.h   |  15 +-
 dbcon/joblist/limitedorderby.cpp         | 231 ++++++-----------------
 dbcon/joblist/limitedorderby.h           |   2 +
 dbcon/joblist/tupleannexstep.cpp         |  42 +++--
 primitives/primproc/serviceexemgr.cpp    |   3 +-
 utils/rowgroup/dumper.cpp                |   6 +-
 utils/rowgroup/rowgroup.h                |   2 +-
 utils/windowfunction/idborderby.h        |   2 +-
 10 files changed, 165 insertions(+), 197 deletions(-)
 create mode 100644 dbcon/joblist/disk-based-topnorderby.cpp

diff --git a/dbcon/joblist/CMakeLists.txt b/dbcon/joblist/CMakeLists.txt
index 3783ccf14e..f0f5449df3 100644
--- a/dbcon/joblist/CMakeLists.txt
+++ b/dbcon/joblist/CMakeLists.txt
@@ -27,6 +27,7 @@ set(joblist_LIB_SRCS
     jobstepassociation.cpp
     lbidlist.cpp
     limitedorderby.cpp
+    disk-based-topnorderby.cpp
     passthrucommand-jl.cpp
     passthrustep.cpp
     pcolscan.cpp
diff --git a/dbcon/joblist/disk-based-topnorderby.cpp b/dbcon/joblist/disk-based-topnorderby.cpp
new file mode 100644
index 0000000000..a94e461361
--- /dev/null
+++ b/dbcon/joblist/disk-based-topnorderby.cpp
@@ -0,0 +1,58 @@
+/* Copyright (C) 2025 MariaDB Corp.
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; version 2 of
+   the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+   MA 02110-1301, USA. */
+
+#include <vector>
+
+#include "dumper.h"
+#include "disk-based-topnorderby.h"
+namespace joblist
+{
+
+// The caller ensures lifetime of dl and rg
+void DiskBasedTopNOrderBy::flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup rg, const size_t numberOfRGs, const bool firstFlush)
+{
+  size_t rgid = (firstFlush) ? numberOfRGs : 0;
+  rowgroup::RGData rgData;
+
+  bool more = dl.next(0, &rgData);
+  while (more)
+  {
+    saveRG(rgid, getGenerationCounter(), rg, &rgData);
+    if (firstFlush)
+    {
+      --rgid;
+    }
+    else
+    {
+      ++rgid;
+    }
+    more = dl.next(0, &rgData);
+  }
+
+  incrementGenerationCounter();
+}
+void DiskBasedTopNOrderBy::diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists)
+{
+}
+
+std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t batchSize)
+{
+  return {};
+}
+
+
+}  // namespace joblist
\ No newline at end of file
diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
index e89c96455e..74fa19c711 100644
--- a/dbcon/joblist/disk-based-topnorderby.h
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -60,18 +60,11 @@ class DiskBasedTopNOrderBy : public rowgroup::RGDumper
   {
     return 0;
   }
-  std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize)
-  {
-    return {};
-  }
+  std::vector<std::string> getGenerationFileNamesNextBatch(const size_t batchSize);
 
-  void flushCurrentToDisk(const bool firstFlush)
-  {
-    incrementGenerationCounter();
-  }
-  void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists)
-  {
-  }
+  // The caller ensures lifetime of dl and rg
+  void flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup rg, const size_t numberOfRGs, const bool firstFlush);
+  void diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists);
 
   //  private:
   uint64_t fGenerationCounter{0};
diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index d898c75cf1..785f82c4d2 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -110,8 +110,8 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   if (fCount == 0)
     return;
 
-  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl;
-  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl;
+  // std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl;
+  // std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl;
   auto& orderedRowsQueue = getQueue();
   // if the row count is less than the limit
   if (orderedRowsQueue.size() < fStart + fCount)
@@ -162,90 +162,8 @@ void LimitedOrderBy::processRow(const rowgroup::Row& row)
   {
     OrderByRow swapRow = orderedRowsQueue.top();
     row1.setData(swapRow.fData);
-    std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
-    std::cout << "LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
-
-    copyRow(row, &row1);
-
-    if (fDistinct)
-    {
-      fDistinctMap->erase(orderedRowsQueue.top().fData);
-      fDistinctMap->insert(row1.getPointer());
-    }
-
-    orderedRowsQueue.pop();
-    orderedRowsQueue.push(swapRow);
-  }
-}
-
-void LimitedOrderBy::processRow_(const rowgroup::Row& row)
-{
-  // check if this is a distinct row
-  if (fDistinct && fDistinctMap->find(row.getPointer()) != fDistinctMap->end())
-    return;
-
-  // @bug5312, limit count is 0, do nothing.
-  if (fCount == 0)
-    return;
-
-  // TODO copy rules or replace ptrs to real instances in CompareRules
-  // auto invertedRule = fRule;
-  // invertedRule.revertRules();
-
-  std::cout << "LimitedOrderBy::processRow row " << row.toString() << std::endl;
-  std::cout << "LimitedOrderBy::processRow fStart " << fStart << " fCount " << fCount << std::endl;
-  auto& orderedRowsQueue = getQueue();
-  // if the row count is less than the limit
-  if (orderedRowsQueue.size() < fStart + fCount)
-  {
-    copyRow(row, &fRow0);
-    OrderByRow newRow(fRow0, fRule);
-    orderedRowsQueue.push(newRow);
-
-    uint64_t memSizeInc = sizeof(newRow);
-    fUncommitedMemory += memSizeInc;
-    if (fUncommitedMemory >= fMaxUncommited)
-    {
-      if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
-      {
-        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-        throw IDBExcept(fErrorCode);
-      }
-      fMemSize += fUncommitedMemory;
-      fUncommitedMemory = 0;
-    }
-
-    // add to the distinct map
-    if (fDistinct)
-      fDistinctMap->insert(fRow0.getPointer());
-
-    fRowGroup.incRowCount();
-    fRow0.nextRow();
-
-    if (fRowGroup.getRowCount() >= fRowsPerRG)
-    {
-      fDataQueue.push(fData);
-      uint64_t newSize = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
-
-      if (!fRm->getMemory(newSize, fSessionMemLimit))
-      {
-        cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-        throw IDBExcept(fErrorCode);
-      }
-      fMemSize += newSize;
-
-      fData.reinit(fRowGroup, fRowsPerRG);
-      fRowGroup.setData(&fData);
-      fRowGroup.resetRowGroup(0);
-      fRowGroup.getRow(0, &fRow0);
-    }
-  }
-  else if (fOrderByCond.size() > 0 && fRule.less(row.getPointer(), orderedRowsQueue.top().fData))
-  {
-    OrderByRow swapRow = orderedRowsQueue.top();
-    row1.setData(swapRow.fData);
-    std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
-    std::cout << "LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
+    // std::cout << "LimitedOrderBy::processRow row2swap " << row1.toString() << std::endl;
+    // std::cout << "LimitedOrderBy::processRow new row 4 swaping " << row.toString() << std::endl;
 
     copyRow(row, &row1);
 
@@ -277,118 +195,93 @@ void LimitedOrderBy::processRow_(const rowgroup::Row& row)
 //   }
 // }
 
-void LimitedOrderBy::brandNewFinalize()
+void LimitedOrderBy::flushCurrentToDisk_(const bool firstFlush)
 {
-  if (!isDiskBased())
-  {
-    return finalize();
-  }
-
-  // if disk-based
-  // here there are <= inputQueuesNumber files on disk
-  // and potentially some in-memory state
-  // need to merge this together to produce a result
+  // make a queue with rgdatas and hand it to DiskBasedTopNOrderBy
+  auto dl = RowGroupDL(1, 1);
+  auto& orderedRowsQueue = getQueue();
+  size_t rowsOverRG = orderedRowsQueue.size() % fRowsPerRG;
+  size_t numberOfRGs = orderedRowsQueue.size() / fRowsPerRG + static_cast<size_t>(rowsOverRG > 0);
+  std::thread flushThread(&DiskBasedTopNOrderBy::flushCurrentToDisk, this, std::ref(dl), fRowGroup,
+                          numberOfRGs, firstFlush);
 
-  if (fUncommitedMemory > 0)
+  uint32_t rSize = fRow0.getSize();
+  // process leftovers
+  if (rowsOverRG)
   {
-    if (!fRm->getMemory(fUncommitedMemory, fSessionMemLimit))
-    {
-      cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-      throw logging::OutOfMemoryExcept(fErrorCode);
-    }
-    fMemSize += fUncommitedMemory;
-    fUncommitedMemory = 0;
-  }
-
-  queue<RGData> tempQueue;
-  if (fRowGroup.getRowCount() > 0)
-    fDataQueue.push(fData);
+    fData.reinit(fRowGroup, rowsOverRG);
+    fRowGroup.setData(&fData);
+    fRowGroup.resetRowGroup(0);
+    fRowGroup.getRow(rowsOverRG - 1, &fRow0);  // start at the last slot; the RG is filled back-to-front
+    for (size_t i = 0; i < rowsOverRG; ++i, fRow0.prevRow(rSize))
+    {  // drain exactly rowsOverRG rows so the partial RG is complete before it is handed to dl
+      row1.setData(orderedRowsQueue.top().fData);
+      copyRow(row1, &fRow0);
+      fRowGroup.incRowCount();
+      orderedRowsQueue.pop();
+    }
 
-  auto& orderedRowsQueue = getQueue();
+    dl.insert(fData);
+  }
 
   if (orderedRowsQueue.size() > 0)
   {
-    // *DRRTUY Very memory intensive. CS needs to account active
-    // memory only and release memory if needed.
-    uint64_t memSizeInc = fRowGroup.getSizeWithStrings() - fRowGroup.getHeaderSize();
-
-    if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
-    {
-      cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-      throw logging::OutOfMemoryExcept(fErrorCode);
-    }
-    fMemSize += memSizeInc;
-
-    uint64_t offset = 0;
-    uint64_t i = 0;
-    // Reduce queue size by an offset value if it applicable.
-    uint64_t queueSizeWoOffset = orderedRowsQueue.size() > fStart ? orderedRowsQueue.size() - fStart : 0;
-    list<RGData> tempRGDataList;
-
-    if (fCount <= queueSizeWoOffset)
-    {
-      offset = fCount % fRowsPerRG;
-      if (!offset && fCount > 0)
-        offset = fRowsPerRG;
-    }
-    else
-    {
-      offset = queueSizeWoOffset % fRowsPerRG;
-      if (!offset && queueSizeWoOffset > 0)
-        offset = fRowsPerRG;
-    }
-
-    list<RGData>::iterator tempListIter = tempRGDataList.begin();
-
-    i = 0;
-    uint32_t rSize = fRow0.getSize();
-    uint64_t preLastRowNumb = fRowsPerRG - 1;
     fData.reinit(fRowGroup, fRowsPerRG);
     fRowGroup.setData(&fData);
     fRowGroup.resetRowGroup(0);
-    // *DRRTUY This approach won't work with
-    // OFSET > fRowsPerRG
-    offset = offset != 0 ? offset - 1 : offset;
-    fRowGroup.getRow(offset, &fRow0);
+    fRowGroup.getRow(fRowsPerRG-1, &fRow0);
 
-    while ((orderedRowsQueue.size() > fStart) && (i++ < fCount))
+    while (!orderedRowsQueue.empty())
     {
       const OrderByRow& topRow = orderedRowsQueue.top();
       row1.setData(topRow.fData);
       copyRow(row1, &fRow0);
       fRowGroup.incRowCount();
-      offset--;
       fRow0.prevRow(rSize);
       orderedRowsQueue.pop();
 
-      // if RG has fRowsPerRG rows
-      if (offset == (uint64_t)-1)
+      if (fRowGroup.getRowCount() == fRowsPerRG)
       {
-        tempRGDataList.push_front(fData);
-
-        if (!fRm->getMemory(memSizeInc, fSessionMemLimit))
-        {
-          cerr << IDBErrorInfo::instance()->errorMsg(fErrorCode) << " @" << __FILE__ << ":" << __LINE__;
-          throw logging::OutOfMemoryExcept(fErrorCode);
-        }
-        fMemSize += memSizeInc;
+        dl.insert(fData);
 
         fData.reinit(fRowGroup, fRowsPerRG);
         fRowGroup.setData(&fData);
-        fRowGroup.resetRowGroup(0);  // ?
-        fRowGroup.getRow(preLastRowNumb, &fRow0);
-        offset = preLastRowNumb;
+        fRowGroup.resetRowGroup(0);
+        fRowGroup.getRow(fRowsPerRG-1, &fRow0);
       }
     }
-    // Push the last/only group into the queue.
+
     if (fRowGroup.getRowCount() > 0)
-      tempRGDataList.push_front(fData);
+      dl.insert(fData);
+  }
 
-    for (tempListIter = tempRGDataList.begin(); tempListIter != tempRGDataList.end(); tempListIter++)
-      tempQueue.push(*tempListIter);
+  dl.endOfInput();
 
-    fDataQueue = tempQueue;
+  // clean up the current queue/rgdatas to free mem
+  // fDataQueue
+  // fDistinctMap
+  // orderedRowsQueue
+  queue<RGData> tempQueue;
+  fDataQueue.swap(tempQueue);
+  if (fDistinctMap)
+  {
+    fDistinctMap->clear();
   }
+
+  flushThread.join();
+}
+
+void LimitedOrderBy::brandNewFinalize()
+{
+  if (!isDiskBased())
+  {
+    return finalize();
+  }
+
+  // if disk-based
+  // here there are <= inputQueuesNumber files on disk
+  // and potentially some in-memory state
+  // need to merge this together to produce a result
 }
 
 /*
diff --git a/dbcon/joblist/limitedorderby.h b/dbcon/joblist/limitedorderby.h
index 0ed284ce56..024bfe6a62 100644
--- a/dbcon/joblist/limitedorderby.h
+++ b/dbcon/joblist/limitedorderby.h
@@ -54,6 +54,8 @@ class LimitedOrderBy : public ordering::IdbOrderBy, public DiskBasedTopNOrderBy
   }
   const std::string toString() const override;
 
+  void flushCurrentToDisk_(const bool firstFlush);
+  
   void finalize();
   void brandNewFinalize();
   bool getNextRGData(rowgroup::RGData& data);
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 949269340b..74ce0b900b 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -609,7 +609,7 @@ std::vector<RowGroupDLSPtr> TupleAnnexStep::createInputDLs(const std::vector<std
   std::vector<RowGroupDLSPtr> result;
   for (size_t i = 0; i < fileNames.size(); ++i)
   {
-    result.emplace_back(new RowGroupDL(1, 16)); // WIP hardcode
+    result.emplace_back(new RowGroupDL(1, 1));  // WIP hardcode
   }
   return result;
 }
@@ -619,20 +619,21 @@ std::vector<uint64_t> TupleAnnexStep::startReaders(std::vector<RowGroupDLSPtr>&
   std::vector<uint64_t> result(dataLists.size());
   for (size_t i = 0; i < dataLists.size(); ++i)
   {
-    result[i] = jobstepThreadPool.invoke([&dataLists, i]() { 
-      // open file 
-      // loop  
-        // read ByteStream
-        // make BS into RGData
-        // put into dataLists[i]
-      // close file
-      // emit empty RGData
-     });
+    result[i] = jobstepThreadPool.invoke(
+        [&dataLists, i]()
+        {
+          // open file
+          // loop
+          // read ByteStream
+          // make BS into RGData
+          // put into dataLists[i]
+          // close file
+          // emit empty RGData
+        });
   }
   return result;
 }
 
-
 void TupleAnnexStep::executeWithOrderBy()
 {
   utils::setThreadName("TNSwOrd");
@@ -666,7 +667,22 @@ void TupleAnnexStep::executeWithOrderBy()
             fRowIn.nextRow();
           }
 
+          // std::cout << "use_count " << rgDataIn.rowData.use_count() << " rgDataIn.rowData " << std::hex
+          //           << (uint64_t)rgDataIn.rowData.get() << std::dec << std::endl;
+          
           more = fInputDL->next(fInputIterator, &rgDataIn);
+          
+          // if (more)
+          // {
+          //   std::cout << "use_count " << rgDataIn.rowData.use_count() << " rgDataIn.rowData " << std::hex
+          //             << (uint64_t)rgDataIn.rowData.get() << std::dec << std::endl;
+          // }
+          // else
+          // {
+          //   std::cout << "use_count " << rgDataIn.rowData.use_count() << std::endl;
+          // }
+          // fOrderBy->fDataQueue.pop();
+          // std::cout << "use_count " << rgDataIn.rowData.use_count() << std::endl;
         }
       }
       catch (const logging::OutOfMemoryExcept&)
@@ -682,7 +698,8 @@ void TupleAnnexStep::executeWithOrderBy()
       if (flushToDisk)
       {
         bool firstFlush = true;
-        fOrderBy->flushCurrentToDisk(firstFlush);
+        std::cout << "disk-based flush" << std::endl;
+        fOrderBy->flushCurrentToDisk_(firstFlush);
         flushToDisk = false;
       }
       else
@@ -704,6 +721,7 @@ void TupleAnnexStep::executeWithOrderBy()
   // store avg RGData size
   if (fOrderBy->isDiskBased())
   {
+    std::cout << "disk-based is triggered" << std::endl;
     // assess RAM available, avg RGData size statistics and free enough memory
     // return memory if needed
     size_t inputQueuesNumber = 2;
diff --git a/primitives/primproc/serviceexemgr.cpp b/primitives/primproc/serviceexemgr.cpp
index 2de371f21d..d04a1b0c55 100644
--- a/primitives/primproc/serviceexemgr.cpp
+++ b/primitives/primproc/serviceexemgr.cpp
@@ -130,7 +130,8 @@ void cleanTempDir()
     TempDirPurpose purpose;
   };
   std::vector<Dirs> dirs{{"HashJoin", "AllowDiskBasedJoin", TempDirPurpose::Joins},
-                         {"RowAggregation", "AllowDiskBasedAggregation", TempDirPurpose::Aggregates}};
+                         {"RowAggregation", "AllowDiskBasedAggregation", TempDirPurpose::Aggregates},
+                         {"Sorting", "AllowDiskBasedSorting", TempDirPurpose::Sorting}};
   const auto config = config::Config::makeConfig();
 
   for (const auto& dir : dirs)
diff --git a/utils/rowgroup/dumper.cpp b/utils/rowgroup/dumper.cpp
index 6935e8c444..998cc0a94a 100644
--- a/utils/rowgroup/dumper.cpp
+++ b/utils/rowgroup/dumper.cpp
@@ -157,8 +157,8 @@ void Dumper::checkBuffer(size_t len)
 std::string RGDumper::makeRGFilename(uint64_t rgid, const uint16_t generation) const
 {
   char buf[PATH_MAX];
-  snprintf(buf, sizeof(buf), "%s/%s-p%u-t%ld-rg%lu-g%u", fOperationName.c_str(), fTmpDir.c_str(), getpid(),
-           fUniqId, rgid, generation);
+  snprintf(buf, sizeof(buf), "%s/p%u-t%ld-g%u-rg%lu", fTmpDir.c_str(), getpid(),
+           fUniqId, generation, rgid);
   return buf;
 }
 
@@ -204,6 +204,8 @@ void RGDumper::saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGr
   rgdata->serialize(bs, fRowGroupOut.getDataSize());
 
   int errNo;
+  auto name = makeRGFilename(rgid, generation);
+  std::cout << "RGDumper::saveRG  " << name << std::endl;
   if ((errNo = write(makeRGFilename(rgid, generation), (char*)bs.buf(), bs.length())) != 0)
   {
     throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR,
diff --git a/utils/rowgroup/rowgroup.h b/utils/rowgroup/rowgroup.h
index 50baf681c1..16ca719d1a 100644
--- a/utils/rowgroup/rowgroup.h
+++ b/utils/rowgroup/rowgroup.h
@@ -365,7 +365,7 @@ class RGData
     return !!rowData;
   }
 
- private:
+//  private:
   uint32_t rowSize = 0;      // can't be.
   uint32_t columnCount = 0;  // shouldn't be, but...
   boost::shared_ptr<RGDataBufType> rowData;
diff --git a/utils/windowfunction/idborderby.h b/utils/windowfunction/idborderby.h
index 67f0379376..c8ca39de5d 100644
--- a/utils/windowfunction/idborderby.h
+++ b/utils/windowfunction/idborderby.h
@@ -458,7 +458,7 @@ class IdbOrderBy : public IdbCompare
 
   std::unique_ptr<SortingPQ> fOrderByQueue = nullptr;
 
- protected:
+//  protected:
   std::vector<IdbSortSpec> fOrderByCond;
   rowgroup::Row fRow0;
   CompareRule fRule;

From d9187e647477029b8d7c690730a29ce8f888de6e Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Fri, 9 May 2025 21:30:10 +0000
Subject: [PATCH 12/14] feat(sorting): fixed data flush with in-memory reset.

---
 dbcon/joblist/limitedorderby.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index 785f82c4d2..ccdf7b3268 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -263,10 +263,14 @@ void LimitedOrderBy::flushCurrentToDisk_(const bool firstFlush)
   // orderedRowsQueue
   queue<RGData> tempQueue;
   fDataQueue.swap(tempQueue);
+
   if (fDistinctMap)
   {
     fDistinctMap->clear();
   }
+  
+  fRm->returnMemory(fMemSize);
+  fMemSize = 0;
 
   flushThread.join();
 }

From 929289e6805474dbf93ef40fbe20cccd38e91cdf Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Tue, 17 Jun 2025 19:33:22 +0000
Subject: [PATCH 13/14] chore(): rebasing

---
 dbcon/joblist/disk-based-topnorderby.cpp | 4 ++--
 dbcon/joblist/disk-based-topnorderby.h   | 2 +-
 dbcon/joblist/tupleannexstep.cpp         | 7 +++----
 utils/rowgroup/CMakeLists.txt            | 2 +-
 utils/rowgroup/dumper.h                  | 2 +-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/dbcon/joblist/disk-based-topnorderby.cpp b/dbcon/joblist/disk-based-topnorderby.cpp
index a94e461361..5aabbb5b10 100644
--- a/dbcon/joblist/disk-based-topnorderby.cpp
+++ b/dbcon/joblist/disk-based-topnorderby.cpp
@@ -45,11 +45,11 @@ void DiskBasedTopNOrderBy::flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup
 
   incrementGenerationCounter();
 }
-void DiskBasedTopNOrderBy::diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& dataLists)
+void DiskBasedTopNOrderBy::diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& /*dataLists*/)
 {
 }
 
-std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t batchSize)
+std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t /*batchSize*/)
 {
   return {};
 }
diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
index 74fa19c711..6e219fa76d 100644
--- a/dbcon/joblist/disk-based-topnorderby.h
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -34,7 +34,7 @@ class DiskBasedTopNOrderBy : public rowgroup::RGDumper
   // std::string fCompStr = config::Config::makeConfig()->getConfig("RowAggregation", "Compression");
  public:
   // TODO Parametrize compression, tmpdir and memory manager (can be temp)
-  DiskBasedTopNOrderBy(ResourceManager* rm)
+  DiskBasedTopNOrderBy(ResourceManager* /*rm*/)
    : RGDumper(compress::getCompressInterfaceByName("LZ4"), std::make_unique<rowgroup::MemManager>(),
               config::Config::makeConfig()->getTempFileDir(config::Config::TempDirPurpose::Sorting),
               "Sorting", reinterpret_cast<std::uintptr_t>(this))
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index 74ce0b900b..c710238d3d 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -82,8 +82,8 @@ struct TAEq
   bool operator()(const rowgroup::Row::Pointer&, const rowgroup::Row::Pointer&) const;
 };
 // TODO:  Generalize these and put them back in utils/common/hasher.h
-using TNSDistinctMap_t = std::unordered_set<rowgroup::Row::Pointer, TAHasher, TAEq,
-                                            allocators::CountingAllocator<rowgroup::Row::Pointer> >;
+using TNSDistinctMap_t =
+    std::unordered_set<rowgroup::Row::Pointer, TAHasher, TAEq, STLPoolAllocator<rowgroup::Row::Pointer> >;
 };  // namespace
 
 inline uint64_t TAHasher::operator()(const Row::Pointer& p) const
@@ -821,9 +821,8 @@ void TupleAnnexStep::finalizeParallelOrderByDistinct()
 
   auto allocSorting = fRm->getAllocator<ordering::OrderByRow>();
   ordering::SortingPQ finalPQ(rowgroup::rgCommonSize, allocSorting);
-  auto allocDistinct = fRm->getAllocator<rowgroup::Row::Pointer>();
   std::unique_ptr<TNSDistinctMap_t> distinctMap(
-      new TNSDistinctMap_t(10, TAHasher(this), TAEq(this), allocDistinct));
+    new TNSDistinctMap_t(10, TAHasher(this), TAEq(this), STLPoolAllocator<rowgroup::Row::Pointer>(fRm)));
   fRowGroupIn.initRow(&row1);
   fRowGroupIn.initRow(&row2);
 
diff --git a/utils/rowgroup/CMakeLists.txt b/utils/rowgroup/CMakeLists.txt
index 2ba4c2d467..ccba6b1d53 100644
--- a/utils/rowgroup/CMakeLists.txt
+++ b/utils/rowgroup/CMakeLists.txt
@@ -2,7 +2,7 @@ include_directories(${ENGINE_COMMON_INCLUDES})
 
 # ########## next target ###############
 
-set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp rowstorage.cpp)
+set(rowgroup_LIB_SRCS rowaggregation.cpp rowgroup.cpp rowstorage.cpp dumper.cpp)
 
 columnstore_library(rowgroup ${rowgroup_LIB_SRCS})
 columnstore_link(rowgroup PRIVATE ${NETSNMP_LIBRARIES} funcexp loggingcpp)
diff --git a/utils/rowgroup/dumper.h b/utils/rowgroup/dumper.h
index b006ed933f..043c162ded 100644
--- a/utils/rowgroup/dumper.h
+++ b/utils/rowgroup/dumper.h
@@ -56,7 +56,7 @@ class RGDumper : protected Dumper
 {
  public:
   RGDumper(const compress::CompressInterface* comp, std::unique_ptr<MemManager> mm, const std::string& tmpDir,
-           const std::string& operationName, const uint64_t uniqId)
+           const std::string& operationName, const uint64_t /*uniqId*/)
    : Dumper(comp, mm), fTmpDir(tmpDir), fOperationName(operationName)
   {
   }

From bf4c17fd98d028af03ff96b8dbf3722e91d0dbe8 Mon Sep 17 00:00:00 2001
From: drrtuy <drrtuy@gmail.com>
Date: Tue, 24 Jun 2025 11:53:32 +0000
Subject: [PATCH 14/14] feat(sorting): preparation for parallel generation
 reading.

---
 dbcon/joblist/disk-based-topnorderby.cpp | 40 ++++++++++++++++--------
 dbcon/joblist/disk-based-topnorderby.h   |  6 +++-
 dbcon/joblist/limitedorderby.cpp         |  8 +++++
 dbcon/joblist/tupleannexstep.cpp         | 12 +++----
 dbcon/joblist/tupleannexstep.h           |  4 +--
 dbcon/mysql/columnstore.cnf              |  2 +-
 utils/rowgroup/dumper.cpp                | 15 +++++++--
 utils/rowgroup/dumper.h                  |  5 +--
 8 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/dbcon/joblist/disk-based-topnorderby.cpp b/dbcon/joblist/disk-based-topnorderby.cpp
index 5aabbb5b10..9a621eb239 100644
--- a/dbcon/joblist/disk-based-topnorderby.cpp
+++ b/dbcon/joblist/disk-based-topnorderby.cpp
@@ -28,31 +28,45 @@ void DiskBasedTopNOrderBy::flushCurrentToDisk(RowGroupDL& dl, rowgroup::RowGroup
   size_t rgid = (firstFlush) ? numberOfRGs : 0;
   rowgroup::RGData rgData;
 
+  size_t generation = (firstFlush) ? getGenerationCounter() : 0; // WIP 
+
   bool more = dl.next(0, &rgData);
   while (more)
   {
-    saveRG(rgid, getGenerationCounter(), rg, &rgData);
-    if (firstFlush)
-    {
-      --rgid;
-    }
-    else
-    {
-      ++rgid;
-    }
+    saveRG(rgid, generation, rg, &rgData);
     more = dl.next(0, &rgData);
+    rgid = (firstFlush) ? rgid - 1 : rgid + 1;
   }
 
-  incrementGenerationCounter();
+  if (firstFlush)
+  {
+    incrementGenerationCounter();
+  }
+  else
+  {
+    
+  }
 }
 void DiskBasedTopNOrderBy::diskBasedMergePhaseIfNeeded(std::vector<RowGroupDLSPtr>& /*dataLists*/)
 {
 }
 
-std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t /*batchSize*/)
-std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t /*batchSize*/)
+std::vector<std::string> DiskBasedTopNOrderBy::getGenerationFileNamesNextBatch(const size_t batchSize)
 {
-  return {};
-}
+  // assert(getGenerationFilesNumber() > batchSize);  // precondition assumed by the subtraction below
+  auto totalNumberOfFilesYetToMerge = getGenerationFilesNumber() - batchSize;
+  auto batchSizeOrFilesLeftNumber = std::max(getGenerationFilesNumber(), batchSize);
+  auto actualBatchSize = std::min(totalNumberOfFilesYetToMerge, batchSizeOrFilesLeftNumber);
+  // add state for the starting offset + wraparound
+  size_t startOffset = 0;
+  std::vector<std::string> res;
+  res.reserve(actualBatchSize);
+  for (size_t i = 0; i < startOffset + actualBatchSize; ++i)
+  {
+    res.push_back(makeRGFilePrefix(i));
+  }
 
+  return res;
+}
 
 }  // namespace joblist
\ No newline at end of file
diff --git a/dbcon/joblist/disk-based-topnorderby.h b/dbcon/joblist/disk-based-topnorderby.h
index 6e219fa76d..d490a94da8 100644
--- a/dbcon/joblist/disk-based-topnorderby.h
+++ b/dbcon/joblist/disk-based-topnorderby.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <cstdint>
+#include <queue>
 #include <string>
 #include <vector>
 
@@ -45,10 +46,12 @@ class DiskBasedTopNOrderBy : public rowgroup::RGDumper
   void incrementGenerationCounter()
   {
     ++fGenerationCounter;
+    uint64_t newGeneration = (fGenerations.empty()) ? 1 : fGenerations.back() + 1;  
+    fGenerations.push(newGeneration);
   }
   uint64_t getGenerationCounter() const
   {
-    return fGenerationCounter;
+    return (fGenerations.empty()) ? 0 : fGenerations.back();  
   }
 
   bool isDiskBased() const
@@ -68,6 +71,7 @@ class DiskBasedTopNOrderBy : public rowgroup::RGDumper
 
   //  private:
   uint64_t fGenerationCounter{0};
+  std::queue<uint64_t> fGenerations;
 };
 
 }  // namespace joblist
\ No newline at end of file
diff --git a/dbcon/joblist/limitedorderby.cpp b/dbcon/joblist/limitedorderby.cpp
index ccdf7b3268..a417570aad 100644
--- a/dbcon/joblist/limitedorderby.cpp
+++ b/dbcon/joblist/limitedorderby.cpp
@@ -262,6 +262,14 @@ void LimitedOrderBy::flushCurrentToDisk_(const bool firstFlush)
   // fDistinctMap
   // orderedRowsQueue
   queue<RGData> tempQueue;
+  // std::cout << "flush num of refs before deletion " << fDataQueue.front().rowData.use_count() << std::endl;
+  // {
+  //   auto frontOfQueue = fDataQueue.front();
+  //   fDataQueue.pop();
+  //   std::cout << "flush num of refs w/o the queue " << frontOfQueue.rowData.use_count() << std::endl;
+  //   std::cout << "flush num of refs swaping the queue with empty " << frontOfQueue.rowData.use_count() << " free " << fMemSize << std::endl;
+  // }
+
   fDataQueue.swap(tempQueue);
 
   if (fDistinctMap)
diff --git a/dbcon/joblist/tupleannexstep.cpp b/dbcon/joblist/tupleannexstep.cpp
index c710238d3d..563fd3d049 100644
--- a/dbcon/joblist/tupleannexstep.cpp
+++ b/dbcon/joblist/tupleannexstep.cpp
@@ -603,19 +603,19 @@ void TupleAnnexStep::checkAndAllocateMemory4RGData(const rowgroup::RowGroup& row
   }
 }
 
-// RowGroupDL* dl1 = new RowGroupDL(1, jobInfo.fifoSize);
-std::vector<RowGroupDLSPtr> TupleAnnexStep::createInputDLs(const std::vector<std::string>& fileNames)
+std::vector<RowGroupDLSPtr> TupleAnnexStep::createInputDLs(const size_t dLsCount) const
 {
   std::vector<RowGroupDLSPtr> result;
-  for (size_t i = 0; i < fileNames.size(); ++i)
+  for (size_t i = 0; i < dLsCount; ++i)
   {
     result.emplace_back(new RowGroupDL(1, 1));  // WIP hardcode
   }
   return result;
 }
 
-std::vector<uint64_t> TupleAnnexStep::startReaders(std::vector<RowGroupDLSPtr>& dataLists)
+std::vector<uint64_t> TupleAnnexStep::startReaders(std::vector<RowGroupDLSPtr>& dataLists, std::vector<std::string>& fileNames)
 {
+  //assert(dataLists.size(), fileNames.size())
   std::vector<uint64_t> result(dataLists.size());
   for (size_t i = 0; i < dataLists.size(); ++i)
   {
@@ -728,8 +728,8 @@ void TupleAnnexStep::executeWithOrderBy()
     while (inputQueuesNumber < fOrderBy->getGenerationFilesNumber())
     {
       auto fileNames = fOrderBy->getGenerationFileNamesNextBatch(inputQueuesNumber);
-      auto inputDLs = createInputDLs(fileNames);
-      auto readers = startReaders(inputDLs);
+      auto inputDLs = createInputDLs(fileNames.size());
+      auto readers = startReaders(inputDLs, fileNames);
       // create outputDLs or simplier atomic queues + readers threads
       fOrderBy->diskBasedMergePhaseIfNeeded(inputDLs);
       jobstepThreadPool.join(readers);
diff --git a/dbcon/joblist/tupleannexstep.h b/dbcon/joblist/tupleannexstep.h
index 5c23398fe5..252001800f 100644
--- a/dbcon/joblist/tupleannexstep.h
+++ b/dbcon/joblist/tupleannexstep.h
@@ -137,8 +137,8 @@ class TupleAnnexStep : public JobStep, public TupleDeliveryStep
   //   return fGenerationCounter;
   // }
 
-  std::vector<RowGroupDLSPtr> createInputDLs(const std::vector<std::string>& fileNames);
-  std::vector<uint64_t> startReaders(std::vector<RowGroupDLSPtr>& inputDLs);
+  std::vector<RowGroupDLSPtr> createInputDLs(const size_t dLsCount) const;
+  std::vector<uint64_t> startReaders(std::vector<RowGroupDLSPtr>& inputDLs, std::vector<std::string>& fileNames);
 
   // input/output rowgroup and row
   rowgroup::RowGroup fRowGroupIn;
diff --git a/dbcon/mysql/columnstore.cnf b/dbcon/mysql/columnstore.cnf
index e0b77f0b24..08a035121a 100644
--- a/dbcon/mysql/columnstore.cnf
+++ b/dbcon/mysql/columnstore.cnf
@@ -1,6 +1,6 @@
 [mariadb-client]
 quick
-quick-max-column-width=0
+loose-quick-max-column-width=0
 
 [mysqld]
 plugin-load-add=ha_columnstore.so
diff --git a/utils/rowgroup/dumper.cpp b/utils/rowgroup/dumper.cpp
index 998cc0a94a..f9cee685f0 100644
--- a/utils/rowgroup/dumper.cpp
+++ b/utils/rowgroup/dumper.cpp
@@ -149,17 +149,26 @@ void Dumper::checkBuffer(size_t len)
     size_t newtmpsz = (len + 8191) / 8192 * 8192;
     std::vector<char> tmpvec(newtmpsz);
     // WIP needs OOM check
-    fMM->acquire(newtmpsz - fTmpBuf.size());
+      fMM->acquire(newtmpsz - fTmpBuf.size());
     fTmpBuf.swap(tmpvec);
   }
 }
 
+std::string RGDumper::makeRGFilePrefix(const uint16_t generation) const
+{
+  char buf[PATH_MAX];
+  snprintf(buf, sizeof(buf), "%s/p%u-t%ld-g%u-rg", fTmpDir.c_str(), getpid(),
+           fUniqId, generation);
+  return {buf};
+}
+
+// TODO Reuse prefix
 std::string RGDumper::makeRGFilename(uint64_t rgid, const uint16_t generation) const
 {
   char buf[PATH_MAX];
   snprintf(buf, sizeof(buf), "%s/p%u-t%ld-g%u-rg%lu", fTmpDir.c_str(), getpid(),
            fUniqId, generation, rgid);
-  return buf;
+  return {buf};
 }
 
 void RGDumper::loadRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut,
@@ -205,7 +214,7 @@ void RGDumper::saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGr
 
   int errNo;
   auto name = makeRGFilename(rgid, generation);
-  std::cout << "RGDumper::saveRG  " << name << std::endl;
+  // std::cout << "RGDumper::saveRG  " << name << std::endl;
   if ((errNo = write(makeRGFilename(rgid, generation), (char*)bs.buf(), bs.length())) != 0)
   {
     throw logging::IDBExcept(logging::IDBErrorInfo::instance()->errorMsg(logging::ERR_DISKAGG_FILEIO_ERROR,
diff --git a/utils/rowgroup/dumper.h b/utils/rowgroup/dumper.h
index 043c162ded..6c6185b293 100644
--- a/utils/rowgroup/dumper.h
+++ b/utils/rowgroup/dumper.h
@@ -56,8 +56,8 @@ class RGDumper : protected Dumper
 {
  public:
   RGDumper(const compress::CompressInterface* comp, std::unique_ptr<MemManager> mm, const std::string& tmpDir,
-           const std::string& operationName, const uint64_t /*uniqId*/)
-   : Dumper(comp, mm), fTmpDir(tmpDir), fOperationName(operationName)
+           const std::string& operationName, const uint64_t uniqId)
+   : Dumper(comp, mm), fTmpDir(tmpDir), fOperationName(operationName), fUniqId(uniqId)
   {
   }
   ~RGDumper() = default;
@@ -66,6 +66,7 @@ class RGDumper : protected Dumper
   void saveRG(uint64_t rgid, const uint16_t generation, RowGroup& fRowGroupOut, RGData* rgdata);
 
   std::string makeRGFilename(uint64_t rgid, const uint16_t generation) const;
+  std::string makeRGFilePrefix(const uint16_t generation) const;
 
  private:
   std::string fTmpDir;