From 37571d6e989b85ecde860512f4fb2867f79a0f1f Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Mon, 11 May 2026 05:37:37 +0000 Subject: [PATCH 01/10] Custom memory resource prototype --- velox/common/memory/CMakeLists.txt | 1 + velox/common/memory/Memory.cpp | 25 ++++++++- velox/common/memory/Memory.h | 26 ++++++++- velox/common/memory/MemoryPool.cpp | 1 + velox/common/memory/MemoryPool.h | 13 +++++ .../common/memory/tests/MemoryManagerTest.cpp | 55 +++++++++++++++++++ velox/core/QueryCtx.cpp | 33 +++++++++++ velox/core/QueryCtx.h | 16 ++++++ velox/core/tests/QueryCtxTest.cpp | 33 +++++++++++ 9 files changed, 200 insertions(+), 3 deletions(-) diff --git a/velox/common/memory/CMakeLists.txt b/velox/common/memory/CMakeLists.txt index d6353c696cc..503c9b7893f 100644 --- a/velox/common/memory/CMakeLists.txt +++ b/velox/common/memory/CMakeLists.txt @@ -40,6 +40,7 @@ velox_add_library( ArbitrationParticipant.h ByteStream.h CompactDoubleList.h + CustomMemoryResource.h HashStringAllocator.h MallocAllocator.h Memory.h diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 45a185cae7e..5c9df0e66fc 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -284,7 +284,8 @@ std::shared_ptr MemoryManager::addRootPool( const std::string& name, int64_t maxCapacity, std::unique_ptr reclaimer, - const std::optional& poolDebugOpts) { + const std::optional& poolDebugOpts, + const std::optional& resourceTag) { std::string poolName = name; if (poolName.empty()) { static std::atomic poolId{0}; @@ -298,6 +299,7 @@ std::shared_ptr MemoryManager::addRootPool( options.coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_; options.getPreferredSize = getPreferredSize_; options.debugOptions = poolDebugOpts; + options.resourceTag = resourceTag; auto pool = createRootPool(poolName, reclaimer, options); if (!disableMemoryPoolTracking_) { @@ -378,6 +380,27 @@ MemoryArbitrator* MemoryManager::arbitrator() { return arbitrator_.get(); } +void MemoryManager::registerCustomResource(CustomMemoryResource resource) { + VELOX_USER_CHECK(!resource.tag.empty(), "CustomMemoryResource tag is empty"); + VELOX_USER_CHECK_NOT_NULL( + resource.allocator, + "CustomMemoryResource allocator is null for tag: {}", + resource.tag); + for (const auto& existing : customResources_) { + VELOX_USER_CHECK_NE( + existing.tag, + resource.tag, + "CustomMemoryResource already registered for tag: {}", + resource.tag); + } + customResources_.push_back(std::move(resource)); +} + +const std::vector& MemoryManager::customResources() + const { + return customResources_; +} + std::string MemoryManager::toString(bool detail) const { const int64_t allocatorCapacity = capacity(); std::stringstream out; diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index c321f204611..dee7d274730 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -37,6 +37,7 @@ #include "velox/common/base/CheckedArithmetic.h" #include "velox/common/base/SuccinctPrinter.h" #include "velox/common/memory/Allocation.h" +#include "velox/common/memory/CustomMemoryResource.h" #include "velox/common/memory/MemoryAllocator.h" #include "velox/common/memory/MemoryPool.h" @@ -215,13 +216,30 @@ class MemoryManager { /// Creates a root memory pool with specified 'name' and 'maxCapacity'. If /// 'name' is missing, the memory manager generates a default name internally - /// to ensure uniqueness. + /// to ensure uniqueness. If 'resourceTag' is set, the pool is associated + /// with the custom memory resource registered under that tag. std::shared_ptr addRootPool( const std::string& name = "", int64_t maxCapacity = kMaxMemory, std::unique_ptr reclaimer = nullptr, const std::optional& poolDebugOpts = - std::nullopt); + std::nullopt, + const std::optional& resourceTag = std::nullopt); + + /// Registers a custom memory resource. Tag must be unique across previously + /// registered resources. Throws VeloxUserError on duplicate. + /// + /// NOT thread-safe. Must be called during process startup, before any + /// QueryCtx is built or any user code reads customResources(). Typical + /// callers are extension init routines invoked from main() or a static + /// initializer. + void registerCustomResource(CustomMemoryResource resource); + + /// Returns the registered custom resources in registration order. + /// Lock-free read; safe only after all registrations have completed (see + /// registerCustomResource). The returned reference is stable for the + /// lifetime of the MemoryManager because registration only appends. + const std::vector& customResources() const; /// Creates a leaf memory pool for direct memory allocation use with specified /// 'name'. If 'name' is missing, the memory manager generates a default name @@ -332,6 +350,10 @@ class MemoryManager { mutable folly::SharedMutex mutex_; // All user root pools allocated from 'this'. std::unordered_map> pools_; + + // Custom memory resources registered before any query started. Not guarded + // by a mutex; see registerCustomResource for the threading contract. + std::vector customResources_; }; /// Initializes the process-wide memory manager based on the specified diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index 8ce43b2bd34..7c9a4ef08e6 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -243,6 +243,7 @@ MemoryPool::MemoryPool( trackUsage_(options.trackUsage), threadSafe_(options.threadSafe), debugOptions_(options.debugOptions), + resourceTag_(options.resourceTag), coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled), getPreferredSize_( options.getPreferredSize == nullptr diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index 4fb4216ca5b..82628f265b7 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -164,6 +164,12 @@ class MemoryPool : public std::enable_shared_from_this { /// If non-empty, enables debug mode for the created memory pool. std::optional debugOptions{std::nullopt}; + + /// If set, this pool belongs to the custom memory resource registered + /// under this tag with MemoryManager::registerCustomResource. Unset means + /// the default CPU resource. Inert until consumers (allocator dispatch, + /// child inheritance) are wired in a follow-up phase. + std::optional resourceTag{std::nullopt}; }; /// Constructs a named memory pool with specified 'name', 'parent' and 'kind'. @@ -343,6 +349,12 @@ class MemoryPool : public std::enable_shared_from_this { return alignment_; } + /// Returns the resource tag this memory pool was created against, or + /// std::nullopt for the default CPU resource. + virtual const std::optional& resourceTag() const { + return resourceTag_; + } + /// Resource governing methods used to track and limit the memory usage /// through this memory pool object. @@ -600,6 +612,7 @@ class MemoryPool : public std::enable_shared_from_this { const bool trackUsage_; const bool threadSafe_; const std::optional debugOptions_; + const std::optional resourceTag_; const bool coreOnAllocationFailureEnabled_; std::function getPreferredSize_; diff --git a/velox/common/memory/tests/MemoryManagerTest.cpp b/velox/common/memory/tests/MemoryManagerTest.cpp index 260d446131f..151e1d8dbab 100644 --- a/velox/common/memory/tests/MemoryManagerTest.cpp +++ b/velox/common/memory/tests/MemoryManagerTest.cpp @@ -295,6 +295,61 @@ TEST_F(MemoryManagerTest, addPoolWithArbitrator) { } // TODO: remove this test when remove deprecatedDefaultMemoryManager. +TEST_F(MemoryManagerTest, customResourceRegistration) { + MemoryManager manager{}; + ASSERT_TRUE(manager.customResources().empty()); + + MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + + CustomMemoryResource resource; + resource.tag = "test-resource"; + resource.maxCapacity = 1L << 28; + resource.allocator = std::make_shared(allocatorOptions); + manager.registerCustomResource(std::move(resource)); + + ASSERT_EQ(manager.customResources().size(), 1); + EXPECT_EQ(manager.customResources()[0].tag, "test-resource"); + EXPECT_EQ(manager.customResources()[0].maxCapacity, 1L << 28); + + // Empty tag is rejected. + CustomMemoryResource emptyTag; + emptyTag.allocator = std::make_shared(allocatorOptions); + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(emptyTag)), + "CustomMemoryResource tag is empty"); + + // Null allocator is rejected. + CustomMemoryResource nullAllocator; + nullAllocator.tag = "another"; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullAllocator)), + "CustomMemoryResource allocator is null for tag: another"); + + // Duplicate tag is rejected. + CustomMemoryResource duplicate; + duplicate.tag = "test-resource"; + duplicate.allocator = std::make_shared(allocatorOptions); + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(duplicate)), + "CustomMemoryResource already registered for tag: test-resource"); + + // The first registration is unaffected. + ASSERT_EQ(manager.customResources().size(), 1); +} + +TEST_F(MemoryManagerTest, addRootPoolWithResourceTag) { + MemoryManager manager{}; + auto pool = manager.addRootPool( + "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); + ASSERT_NE(pool, nullptr); + ASSERT_TRUE(pool->resourceTag().has_value()); + EXPECT_EQ(*pool->resourceTag(), "gpu"); + + auto untagged = manager.addRootPool("untagged-root"); + EXPECT_FALSE(untagged->resourceTag().has_value()); +} + TEST_F(MemoryManagerTest, defaultMemoryManager) { auto& managerA = toMemoryManager(deprecatedDefaultMemoryManager()); auto& managerB = toMemoryManager(deprecatedDefaultMemoryManager()); diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index 6cfcb3950ba..8440b339089 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -56,6 +56,23 @@ std::shared_ptr QueryCtx::Builder::build() { std::move(tokenProvider_), std::move(traceCtxProvider_))); queryCtx->maybeSetReclaimer(); + for (const auto& mr : memory::memoryManager()->customResources()) { + auto reclaimer = + mr.reclaimerFactory ? mr.reclaimerFactory(queryCtx.get()) : nullptr; + // Reuse generatePoolName so the custom pool gets the same uniqueness + // guarantee as the regular query pool (atomic seq counter), then suffix + // the resource tag so it's identifiable in pool dumps. + auto pool = memory::memoryManager()->addRootPool( + fmt::format( + "{}.{}", + QueryCtx::generatePoolName(queryCtx->queryId()), + mr.tag), + mr.maxCapacity, + std::move(reclaimer), + std::nullopt, + mr.tag); + queryCtx->addCustomPool(std::move(pool)); + } for (auto& cb : releaseCallbacks_) { queryCtx->addReleaseCallback(std::move(cb)); } @@ -105,6 +122,22 @@ QueryCtx::~QueryCtx() { return fmt::format("query.{}.{}", queryId, seqNum++); } +void QueryCtx::addCustomPool(std::shared_ptr pool) { + VELOX_CHECK_NOT_NULL(pool); + customPools_.push_back(std::move(pool)); +} + +std::shared_ptr QueryCtx::customPool( + const std::string& tag) const { + for (const auto& pool : customPools_) { + const auto& poolTag = pool->resourceTag(); + if (poolTag.has_value() && *poolTag == tag) { + return pool; + } + } + return nullptr; +} + void QueryCtx::maybeSetReclaimer() { VELOX_CHECK_NOT_NULL(pool_); VELOX_CHECK(!underArbitration_); diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 749c6d0be3f..236f76db5bd 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -379,6 +379,21 @@ class QueryCtx : public std::enable_shared_from_this { pool_ = std::move(pool); } + /// Tracks an additional root pool tied to a custom memory resource. Called + /// by Builder::build for each entry in MemoryManager::customResources. + void addCustomPool(std::shared_ptr pool); + + /// Returns the custom root pool associated with the resource tag, or + /// nullptr if no pool was registered under that tag for this query. + std::shared_ptr customPool( + const std::string& tag) const; + + /// Returns all custom root pools registered for this query, in registration + /// order. The reference is stable for the lifetime of the QueryCtx. + const std::vector>& customPools() const { + return customPools_; + } + /// Indicates if the query is under memory arbitration or not. bool testingUnderArbitration() const { std::lock_guard l(mutex_); @@ -471,6 +486,7 @@ class QueryCtx : public std::enable_shared_from_this { std::unordered_map> connectorSessionProperties_; std::shared_ptr pool_; + std::vector> customPools_; QueryConfig queryConfig_; std::atomic numSpilledBytes_{0}; std::atomic numTracedBytes_{0}; diff --git a/velox/core/tests/QueryCtxTest.cpp b/velox/core/tests/QueryCtxTest.cpp index 44dbc5927c4..b78b6db0fe1 100644 --- a/velox/core/tests/QueryCtxTest.cpp +++ b/velox/core/tests/QueryCtxTest.cpp @@ -16,6 +16,7 @@ #include #include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/memory/MallocAllocator.h" #include "velox/core/QueryCtx.h" namespace facebook::velox::core::test { @@ -113,6 +114,38 @@ TEST_F(QueryCtxTest, releaseCallbackException) { ASSERT_EQ(callbackCount, 2); } +TEST_F(QueryCtxTest, customResourcePerQueryPool) { + memory::MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + + memory::CustomMemoryResource resource; + resource.tag = "qctx-test-resource"; + resource.maxCapacity = 1L << 28; + resource.allocator = + std::make_shared(allocatorOptions); + + bool factoryInvoked = false; + resource.reclaimerFactory = [&factoryInvoked](QueryCtx* /*ctx*/) { + factoryInvoked = true; + return memory::MemoryReclaimer::create(0); + }; + + memory::memoryManager()->registerCustomResource(std::move(resource)); + + auto queryCtx = QueryCtx::Builder().queryId("custom-resource-query").build(); + EXPECT_TRUE(factoryInvoked); + + auto pool = queryCtx->customPool("qctx-test-resource"); + ASSERT_NE(pool, nullptr); + EXPECT_TRUE(pool->name().starts_with("query.custom-resource-query.")); + EXPECT_TRUE(pool->name().ends_with(".qctx-test-resource")); + ASSERT_TRUE(pool->resourceTag().has_value()); + EXPECT_EQ(*pool->resourceTag(), "qctx-test-resource"); + + EXPECT_EQ(queryCtx->customPool("does-not-exist"), nullptr); + EXPECT_EQ(queryCtx->customPools().size(), 1); +} + TEST_F(QueryCtxTest, builderReleaseCallbacks) { int callbackCount = 0; std::string capturedQueryId; From 2087256e7cdb7d6aabb752dbc0ad79a2c05a4c31 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Thu, 14 May 2026 01:09:09 +0000 Subject: [PATCH 02/10] feat(memory): Wire CustomMemoryResource dispatch and add test suite The prototype commit registered CustomMemoryResource and gave QueryCtx per-resource root pools, but pool allocations still routed through the MemoryManager default allocator. This wires the dispatch end-to-end: - MemoryPool::Options carries a shared_ptr. MemoryPoolImpl resolves allocator_ and arbitrator_ from the resource at construction, falling back to manager defaults when null. - genChild propagates the resource to children, so aggregate and leaf children of a custom-resource root inherit the allocator and tag. - MemoryManager::addRootPool resolves resourceTag to a registered resource once and stores the shared_ptr. Unregistered tags throw. - arbitrator addPool/removePool route through pool->arbitrator() so custom arbitrators participate in pool registration. - customResources_ becomes vector>; pools keep strong references, replacing the implicit manager-outlives-pools contract with a refcount invariant. Tests consolidated into CustomMemoryResourceTest.cpp covering registration, ordering, pointer stability, per-query pool creation, chained-reclaimer ordering, cross-resource spill flow, and root/child allocator dispatch. Co-Authored-By: Claude Opus 4.7 (1M context) --- velox/common/memory/CustomMemoryResource.h | 59 +++ velox/common/memory/Memory.cpp | 28 +- velox/common/memory/Memory.h | 30 +- velox/common/memory/MemoryPool.cpp | 22 +- velox/common/memory/MemoryPool.h | 33 +- velox/common/memory/tests/CMakeLists.txt | 1 + .../memory/tests/CustomMemoryResourceTest.cpp | 388 ++++++++++++++++++ .../common/memory/tests/MemoryManagerTest.cpp | 55 --- velox/core/QueryCtx.cpp | 11 +- velox/core/QueryCtx.h | 10 +- velox/core/tests/QueryCtxTest.cpp | 33 -- 11 files changed, 532 insertions(+), 138 deletions(-) create mode 100644 velox/common/memory/CustomMemoryResource.h create mode 100644 velox/common/memory/tests/CustomMemoryResourceTest.cpp diff --git a/velox/common/memory/CustomMemoryResource.h b/velox/common/memory/CustomMemoryResource.h new file mode 100644 index 00000000000..c9bd9f50177 --- /dev/null +++ b/velox/common/memory/CustomMemoryResource.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace facebook::velox::core { +class QueryCtx; +} // namespace facebook::velox::core + +namespace facebook::velox::memory { + +class MemoryAllocator; +class MemoryArbitrator; +class MemoryReclaimer; + +/// Describes an externally-provided memory resource (e.g. a GPU or tiered +/// memory backend) registered with MemoryManager and referenced by 'tag' when +/// building per-query memory pools. +struct CustomMemoryResource { + /// Unique non-empty identifier. + std::string tag; + + /// Capacity of the per-query root pool created for this resource. + int64_t maxCapacity{std::numeric_limits::max()}; + + /// Allocator backing pools tagged with this resource. Required. + std::shared_ptr allocator; + + /// Per-resource arbitrator. When non-null, pools tagged with this resource + /// route capacity decisions through it. When null, they fall back to the + /// MemoryManager's default arbitrator. + std::shared_ptr arbitrator; + + /// Optional factory invoked once per QueryCtx to build the reclaimer + /// attached to the resource's per-query root pool. + std::function(core::QueryCtx*)> + reclaimerFactory; +}; + +} // namespace facebook::velox::memory diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 5c9df0e66fc..805264019c9 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -274,7 +274,7 @@ std::shared_ptr MemoryManager::createRootPool( std::move(reclaimer), options); VELOX_CHECK_EQ(pool->capacity(), 0); - arbitrator_->addPool(pool); + pool->arbitrator()->addPool(pool); RECORD_HISTOGRAM_METRIC_VALUE( kMetricMemoryPoolInitialCapacityBytes, pool->capacity()); return pool; @@ -299,7 +299,18 @@ std::shared_ptr MemoryManager::addRootPool( options.coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_; options.getPreferredSize = getPreferredSize_; options.debugOptions = poolDebugOpts; - options.resourceTag = resourceTag; + if (resourceTag.has_value()) { + for (const auto& candidate : customResources_) { + if (candidate->tag == *resourceTag) { + options.resource = candidate; + break; + } + } + VELOX_USER_CHECK_NOT_NULL( + options.resource, + "No CustomMemoryResource registered for tag: {}", + *resourceTag); + } auto pool = createRootPool(poolName, reclaimer, options); if (!disableMemoryPoolTracking_) { @@ -310,7 +321,7 @@ std::shared_ptr MemoryManager::addRootPool( } pools_.emplace(poolName, pool); } catch (const VeloxRuntimeError&) { - arbitrator_->removePool(pool.get()); + pool->arbitrator()->removePool(pool.get()); throw; } } @@ -342,7 +353,7 @@ uint64_t MemoryManager::shrinkPools( void MemoryManager::dropPool(MemoryPool* pool) { VELOX_CHECK_NOT_NULL(pool); VELOX_DCHECK_EQ(pool->reservedBytes(), 0); - arbitrator_->removePool(pool); + pool->arbitrator()->removePool(pool); if (disableMemoryPoolTracking_) { return; } @@ -388,16 +399,17 @@ void MemoryManager::registerCustomResource(CustomMemoryResource resource) { resource.tag); for (const auto& existing : customResources_) { VELOX_USER_CHECK_NE( - existing.tag, + existing->tag, resource.tag, "CustomMemoryResource already registered for tag: {}", resource.tag); } - customResources_.push_back(std::move(resource)); + customResources_.push_back( + std::make_shared(std::move(resource))); } -const std::vector& MemoryManager::customResources() - const { +const std::vector>& +MemoryManager::customResources() const { return customResources_; } diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index dee7d274730..16225892fb0 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -226,20 +226,15 @@ class MemoryManager { std::nullopt, const std::optional& resourceTag = std::nullopt); - /// Registers a custom memory resource. Tag must be unique across previously - /// registered resources. Throws VeloxUserError on duplicate. - /// - /// NOT thread-safe. Must be called during process startup, before any - /// QueryCtx is built or any user code reads customResources(). Typical - /// callers are extension init routines invoked from main() or a static - /// initializer. + /// Registers a custom memory resource. NOT thread-safe: must be called + /// during process startup before any QueryCtx is built. Throws on empty + /// tag, null allocator, or duplicate tag. void registerCustomResource(CustomMemoryResource resource); - /// Returns the registered custom resources in registration order. - /// Lock-free read; safe only after all registrations have completed (see - /// registerCustomResource). The returned reference is stable for the - /// lifetime of the MemoryManager because registration only appends. - const std::vector& customResources() const; + /// Returns the registered custom resources in registration order. Safe to + /// read concurrently once registrations have completed. + const std::vector>& customResources() + const; /// Creates a leaf memory pool for direct memory allocation use with specified /// 'name'. If 'name' is missing, the memory manager generates a default name @@ -315,6 +310,11 @@ class MemoryManager { return sharedLeafPools_; } + /// Test-only: drops all registered custom memory resources. + void testingClearCustomResources() { + customResources_.clear(); + } + private: std::shared_ptr createRootPool( std::string poolName, @@ -351,9 +351,9 @@ class MemoryManager { // All user root pools allocated from 'this'. std::unordered_map> pools_; - // Custom memory resources registered before any query started. Not guarded - // by a mutex; see registerCustomResource for the threading contract. - std::vector customResources_; + // Append-only; see registerCustomResource for the threading contract. + // Pools hold shared_ptrs so a resource outlives every pool that uses it. + std::vector> customResources_; }; /// Initializes the process-wide memory manager based on the specified diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index 7c9a4ef08e6..796a38c3a4c 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -243,7 +243,7 @@ MemoryPool::MemoryPool( trackUsage_(options.trackUsage), threadSafe_(options.threadSafe), debugOptions_(options.debugOptions), - resourceTag_(options.resourceTag), + resource_(options.resource), coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled), getPreferredSize_( options.getPreferredSize == nullptr @@ -260,6 +260,13 @@ MemoryPool::~MemoryPool() { VELOX_CHECK(children_.empty()); } +std::optional MemoryPool::resourceTag() const { + if (resource_ == nullptr) { + return std::nullopt; + } + return resource_->tag; +} + // static std::string MemoryPool::kindString(Kind kind) { switch (kind) { @@ -458,8 +465,14 @@ MemoryPoolImpl::MemoryPoolImpl( const Options& options) : MemoryPool{name, kind, parent, options}, manager_{memoryManager}, - allocator_{manager_->allocator()}, - arbitrator_{manager_->arbitrator()}, + allocator_{ + options.resource != nullptr && options.resource->allocator + ? options.resource->allocator.get() + : manager_->allocator()}, + arbitrator_{ + options.resource != nullptr && options.resource->arbitrator + ? options.resource->arbitrator.get() + : manager_->arbitrator()}, reclaimer_(std::move(reclaimer)), // The memory manager sets the capacity through grow() according to the // actually used memory arbitration policy. @@ -904,7 +917,8 @@ std::shared_ptr MemoryPoolImpl::genChild( .threadSafe = threadSafe, .coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_, .getPreferredSize = getPreferredSize, - .debugOptions = debugOptions_}); + .debugOptions = debugOptions_, + .resource = resource_}); } bool MemoryPoolImpl::maybeReserve(uint64_t increment) { diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index 82628f265b7..8369bc7d3f8 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -39,6 +39,7 @@ class ParallelMemoryReclaimer; namespace facebook::velox::memory { class TestArbitrator; class MemoryManager; +struct CustomMemoryResource; constexpr int64_t kMaxMemory = std::numeric_limits::max(); @@ -165,11 +166,11 @@ class MemoryPool : public std::enable_shared_from_this { /// If non-empty, enables debug mode for the created memory pool. std::optional debugOptions{std::nullopt}; - /// If set, this pool belongs to the custom memory resource registered - /// under this tag with MemoryManager::registerCustomResource. Unset means - /// the default CPU resource. Inert until consumers (allocator dispatch, - /// child inheritance) are wired in a follow-up phase. - std::optional resourceTag{std::nullopt}; + /// Custom memory resource this pool belongs to. Non-null means the pool + /// routes allocations to this resource's allocator (and arbitrator, if + /// set). Null means the default CPU resource. The pool keeps a strong + /// reference, so the resource outlives every pool that uses it. + std::shared_ptr resource{nullptr}; }; /// Constructs a named memory pool with specified 'name', 'parent' and 'kind'. @@ -349,12 +350,16 @@ class MemoryPool : public std::enable_shared_from_this { return alignment_; } - /// Returns the resource tag this memory pool was created against, or - /// std::nullopt for the default CPU resource. - virtual const std::optional& resourceTag() const { - return resourceTag_; + /// Returns the custom memory resource this pool routes allocations to, or + /// nullptr for the default CPU resource. + virtual const std::shared_ptr& resource() const { + return resource_; } + /// Returns the tag of the custom resource this pool was created against, or + /// std::nullopt for the default CPU resource. + virtual std::optional resourceTag() const; + /// Resource governing methods used to track and limit the memory usage /// through this memory pool object. @@ -427,6 +432,10 @@ class MemoryPool : public std::enable_shared_from_this { /// Returns the memory reclaimer of this memory pool if not null. virtual MemoryReclaimer* reclaimer() const = 0; + /// Returns the arbitrator that governs this pool's capacity. May be the + /// MemoryManager's default arbitrator or a custom resource's arbitrator. + virtual MemoryArbitrator* arbitrator() const = 0; + /// Function estimates the number of reclaimable bytes and returns in /// 'reclaimableBytes'. If the 'reclaimer' is not set, the function returns /// std::nullopt. Otherwise, it will invoke the corresponding method of the @@ -612,7 +621,7 @@ class MemoryPool : public std::enable_shared_from_this { const bool trackUsage_; const bool threadSafe_; const std::optional debugOptions_; - const std::optional resourceTag_; + const std::shared_ptr resource_; const bool coreOnAllocationFailureEnabled_; std::function getPreferredSize_; @@ -733,6 +742,10 @@ class MemoryPoolImpl : public MemoryPool { MemoryReclaimer* reclaimer() const override; + MemoryArbitrator* arbitrator() const override { + return arbitrator_; + } + std::optional reclaimableBytes() const override; uint64_t reclaim( diff --git a/velox/common/memory/tests/CMakeLists.txt b/velox/common/memory/tests/CMakeLists.txt index 77ce4489f9d..deff8332a32 100644 --- a/velox/common/memory/tests/CMakeLists.txt +++ b/velox/common/memory/tests/CMakeLists.txt @@ -20,6 +20,7 @@ add_executable( ArbitrationParticipantTest.cpp ByteStreamTest.cpp CompactDoubleListTest.cpp + CustomMemoryResourceTest.cpp HashStringAllocatorTest.cpp MemoryAllocatorTest.cpp MemoryArbitratorTest.cpp diff --git a/velox/common/memory/tests/CustomMemoryResourceTest.cpp b/velox/common/memory/tests/CustomMemoryResourceTest.cpp new file mode 100644 index 00000000000..b924d590222 --- /dev/null +++ b/velox/common/memory/tests/CustomMemoryResourceTest.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/Memory.h" +#include "velox/core/QueryCtx.h" + +namespace facebook::velox::memory::test { +namespace { + +CustomMemoryResource makeResource(const std::string& tag) { + MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + CustomMemoryResource resource; + resource.tag = tag; + resource.allocator = std::make_shared(allocatorOptions); + return resource; +} + +} // namespace + +// MemoryManager-level tests use a local MemoryManager so they don't touch +// the process-wide singleton. +class CustomMemoryResourceManagerTest : public testing::Test {}; + +TEST_F(CustomMemoryResourceManagerTest, registrationValidation) { + MemoryManager manager{}; + ASSERT_TRUE(manager.customResources().empty()); + + auto resource = makeResource("test-resource"); + resource.maxCapacity = 1L << 28; + manager.registerCustomResource(std::move(resource)); + + ASSERT_EQ(manager.customResources().size(), 1); + EXPECT_EQ(manager.customResources()[0]->tag, "test-resource"); + EXPECT_EQ(manager.customResources()[0]->maxCapacity, 1L << 28); + + CustomMemoryResource emptyTag; + emptyTag.allocator = makeResource("ignored").allocator; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(emptyTag)), + "CustomMemoryResource tag is empty"); + + CustomMemoryResource nullAllocator; + nullAllocator.tag = "another"; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullAllocator)), + "CustomMemoryResource allocator is null for tag: another"); + + VELOX_ASSERT_THROW( + manager.registerCustomResource(makeResource("test-resource")), + "CustomMemoryResource already registered for tag: test-resource"); + + ASSERT_EQ(manager.customResources().size(), 1); +} + +TEST_F(CustomMemoryResourceManagerTest, customResourcesEmptyByDefault) { + MemoryManager manager{}; + EXPECT_TRUE(manager.customResources().empty()); +} + +TEST_F(CustomMemoryResourceManagerTest, registrationOrderPreserved) { + MemoryManager manager{}; + for (const auto* tag : {"a", "b", "c"}) { + manager.registerCustomResource(makeResource(tag)); + } + ASSERT_EQ(manager.customResources().size(), 3); + EXPECT_EQ(manager.customResources()[0]->tag, "a"); + EXPECT_EQ(manager.customResources()[1]->tag, "b"); + EXPECT_EQ(manager.customResources()[2]->tag, "c"); +} + +TEST_F(CustomMemoryResourceManagerTest, addRootPoolPropagatesResourceTag) { + MemoryManager manager{}; + manager.registerCustomResource(makeResource("gpu")); + + auto tagged = manager.addRootPool( + "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); + ASSERT_NE(tagged, nullptr); + ASSERT_TRUE(tagged->resourceTag().has_value()); + EXPECT_EQ(*tagged->resourceTag(), "gpu"); + + auto untagged = manager.addRootPool("untagged-root"); + EXPECT_FALSE(untagged->resourceTag().has_value()); + + VELOX_ASSERT_THROW( + manager.addRootPool( + "bad-root", kMaxMemory, nullptr, std::nullopt, "not-registered"), + "No CustomMemoryResource registered for tag: not-registered"); +} + +// QueryCtx-level tests run against the process-wide MemoryManager because +// QueryCtx::Builder reads it directly. Each test cleans up after itself so +// resources don't leak into siblings. +class CustomMemoryResourceQueryCtxTest : public testing::Test { + protected: + static void SetUpTestCase() { + MemoryManager::testingSetInstance(MemoryManager::Options{}); + } + + void TearDown() override { + memoryManager()->testingClearCustomResources(); + } +}; + +TEST_F(CustomMemoryResourceQueryCtxTest, queryCtxWithoutResourcesHasNoCustomPools) { + auto queryCtx = core::QueryCtx::Builder().queryId("q-empty").build(); + EXPECT_TRUE(queryCtx->customPools().empty()); + EXPECT_EQ(queryCtx->customPool("anything"), nullptr); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, addCustomPoolRejectsNull) { + auto queryCtx = core::QueryCtx::Builder().queryId("q-null").build(); + VELOX_ASSERT_THROW(queryCtx->addCustomPool(nullptr), ""); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, reclaimerFactoryExceptionPropagates) { + auto resource = makeResource("boom"); + resource.reclaimerFactory = + [](core::QueryCtx*) -> std::unique_ptr { + throw std::runtime_error("factory boom"); + }; + memoryManager()->registerCustomResource(std::move(resource)); + + EXPECT_THROW( + core::QueryCtx::Builder().queryId("q-boom").build(), + std::runtime_error); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, multipleQueryCtxsGetDistinctCustomPools) { + memoryManager()->registerCustomResource(makeResource("gpu")); + + auto q1 = core::QueryCtx::Builder().queryId("q-shared").build(); + auto q2 = core::QueryCtx::Builder().queryId("q-shared").build(); + auto p1 = q1->customPool("gpu"); + auto p2 = q2->customPool("gpu"); + ASSERT_NE(p1, nullptr); + ASSERT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + EXPECT_NE(p1->name(), p2->name()); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, perQueryRootPoolCreated) { + auto resource = makeResource("gpu"); + resource.maxCapacity = 1L << 28; + + bool factoryInvoked = false; + core::QueryCtx* factorySawCtx = nullptr; + resource.reclaimerFactory = [&](core::QueryCtx* ctx) { + factoryInvoked = true; + factorySawCtx = ctx; + return MemoryReclaimer::create(0); + }; + + memoryManager()->registerCustomResource(std::move(resource)); + + auto queryCtx = core::QueryCtx::Builder().queryId("q0").build(); + EXPECT_TRUE(factoryInvoked); + EXPECT_EQ(factorySawCtx, queryCtx.get()); + + auto pool = queryCtx->customPool("gpu"); + ASSERT_NE(pool, nullptr); + EXPECT_TRUE(pool->name().starts_with("query.q0.")); + EXPECT_TRUE(pool->name().ends_with(".gpu")); + ASSERT_TRUE(pool->resourceTag().has_value()); + EXPECT_EQ(*pool->resourceTag(), "gpu"); + EXPECT_EQ(pool->maxCapacity(), 1L << 28); + + EXPECT_EQ(queryCtx->customPool("missing"), nullptr); + EXPECT_EQ(queryCtx->customPools().size(), 1); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, noReclaimerFactory) { + memoryManager()->registerCustomResource(makeResource("plain")); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-plain").build(); + auto pool = queryCtx->customPool("plain"); + ASSERT_NE(pool, nullptr); + EXPECT_EQ(pool->reclaimer(), nullptr); +} + +TEST_F(CustomMemoryResourceQueryCtxTest, customPoolsMatchRegistrationOrder) { + for (const auto* tag : {"a", "b", "c"}) { + memoryManager()->registerCustomResource(makeResource(tag)); + } + + auto queryCtx = core::QueryCtx::Builder().queryId("q-order").build(); + ASSERT_EQ(queryCtx->customPools().size(), 3); + EXPECT_EQ(*queryCtx->customPools()[0]->resourceTag(), "a"); + EXPECT_EQ(*queryCtx->customPools()[1]->resourceTag(), "b"); + EXPECT_EQ(*queryCtx->customPools()[2]->resourceTag(), "c"); +} + +// Locks down the chained-reclaimer ordering contract: a later-registered +// resource's reclaimerFactory must see earlier resources' per-query pools +// already attached to the QueryCtx. A factory cannot see its own pool because +// the factory runs before that pool is added. +TEST_F(CustomMemoryResourceQueryCtxTest, reclaimerFactorySeesPriorSiblingPool) { + memoryManager()->registerCustomResource(makeResource("primary")); + + std::shared_ptr capturedPrimary; + std::shared_ptr capturedSelf; + auto secondary = makeResource("secondary"); + secondary.reclaimerFactory = [&](core::QueryCtx* ctx) { + capturedPrimary = ctx->customPool("primary"); + capturedSelf = ctx->customPool("secondary"); + return MemoryReclaimer::create(0); + }; + memoryManager()->registerCustomResource(std::move(secondary)); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-chain").build(); + ASSERT_NE(capturedPrimary, nullptr); + EXPECT_EQ(capturedPrimary, queryCtx->customPool("primary")); + EXPECT_EQ(capturedSelf, nullptr); +} + +// Test reclaimer that "spills" by allocating from a sibling resource's pool. +// Models the chained-reclaimer pattern (e.g. device-memory pool spills into a +// pinned-host pool). +class SpillToSiblingReclaimer : public MemoryReclaimer { + public: + static std::unique_ptr create( + std::shared_ptr sibling) { + return std::unique_ptr( + new SpillToSiblingReclaimer(std::move(sibling))); + } + + ~SpillToSiblingReclaimer() override { + // Release any buffers we acquired during reclaim before the leaf pools + // they came from get destroyed; otherwise the leaf destructor aborts on + // outstanding usage. + for (auto& spill : spills_) { + spill.leaf->free(spill.buffer, static_cast(spill.bytes)); + } + } + + uint64_t reclaim( + MemoryPool* /*pool*/, + uint64_t targetBytes, + uint64_t /*maxWaitMs*/, + Stats& /*stats*/) override { + ++numReclaimCalls_; + auto leaf = sibling_->addLeafChild( + fmt::format("spill-{}", numReclaimCalls_)); + void* buffer = leaf->allocate(static_cast(targetBytes)); + spills_.push_back({std::move(leaf), buffer, targetBytes}); + return targetBytes; + } + + int numReclaimCalls() const { + return numReclaimCalls_; + } + + private: + explicit SpillToSiblingReclaimer(std::shared_ptr sibling) + : MemoryReclaimer(0), sibling_(std::move(sibling)) {} + + struct Spill { + std::shared_ptr leaf; + void* buffer; + uint64_t bytes; + }; + + std::shared_ptr sibling_; + int numReclaimCalls_{0}; + std::vector spills_; +}; + +// Locks down the cross-resource spill flow: when reclaim is triggered on a +// custom pool, the resource's reclaimer can allocate into a sibling resource's +// pool, modeling device -> pinned-host spill. +TEST_F(CustomMemoryResourceQueryCtxTest, deviceReclaimerSpillsToHostSibling) { + memoryManager()->registerCustomResource(makeResource("host")); + + SpillToSiblingReclaimer* mockReclaimer = nullptr; + auto device = makeResource("device"); + device.reclaimerFactory = [&](core::QueryCtx* ctx) { + auto reclaimer = SpillToSiblingReclaimer::create(ctx->customPool("host")); + mockReclaimer = static_cast(reclaimer.get()); + return reclaimer; + }; + memoryManager()->registerCustomResource(std::move(device)); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-spill").build(); + auto devicePool = queryCtx->customPool("device"); + auto hostPool = queryCtx->customPool("host"); + ASSERT_NE(devicePool, nullptr); + ASSERT_NE(hostPool, nullptr); + ASSERT_NE(mockReclaimer, nullptr); + + const uint64_t target = 4 * 1024; + MemoryReclaimer::Stats stats; + const auto reclaimed = devicePool->reclaim(target, 0, stats); + + EXPECT_EQ(mockReclaimer->numReclaimCalls(), 1); + EXPECT_EQ(reclaimed, target); + EXPECT_GE(hostPool->usedBytes(), static_cast(target)); +} + +// Gap-coverage tests below. These pin the contracts that allocator dispatch +// and pointer-stability must satisfy. They are expected to fail until the +// dispatch and storage gaps are implemented. + +// Gap 1: a pool created against a custom resource must allocate through that +// resource's allocator, not the MemoryManager default. +TEST_F( + CustomMemoryResourceQueryCtxTest, + rootPoolDispatchesToCustomResourceAllocator) { + auto resource = makeResource("gpu"); + auto* expectedAllocator = resource.allocator.get(); + memoryManager()->registerCustomResource(std::move(resource)); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-dispatch").build(); + auto pool = queryCtx->customPool("gpu"); + ASSERT_NE(pool, nullptr); + auto* impl = static_cast(pool.get()); + EXPECT_EQ(impl->testingAllocator(), expectedAllocator) + << "Root pool must resolve its allocator from the custom resource, " + "not the MemoryManager default."; +} + +// Gap 3: a child of a custom-resource root pool must inherit the resource's +// allocator (and, transitively, its arbitrator once dispatch is wired). +TEST_F( + CustomMemoryResourceQueryCtxTest, + childPoolInheritsCustomResourceAllocator) { + auto resource = makeResource("gpu"); + auto* expectedAllocator = resource.allocator.get(); + memoryManager()->registerCustomResource(std::move(resource)); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-child").build(); + auto root = queryCtx->customPool("gpu"); + ASSERT_NE(root, nullptr); + auto aggregate = root->addAggregateChild("agg"); + auto leaf = aggregate->addLeafChild("leaf"); + + EXPECT_EQ( + static_cast(aggregate.get())->testingAllocator(), + expectedAllocator); + EXPECT_EQ( + static_cast(leaf.get())->testingAllocator(), + expectedAllocator); + ASSERT_TRUE(leaf->resourceTag().has_value()); + EXPECT_EQ(*leaf->resourceTag(), "gpu"); +} + +// Gap 4: the registration list must offer stable references. Once dispatch is +// wired, pools will store CustomMemoryResource* pointers obtained at +// construction time; later registrations must not invalidate them. +// +// Compares addresses only — does not dereference the captured pointer, since +// dereferencing a dangling pointer on contract violation would crash gtest +// before it can report the failure. +TEST_F(CustomMemoryResourceManagerTest, registrationKeepsExistingPointersStable) { + MemoryManager manager{}; + manager.registerCustomResource(makeResource("first")); + const CustomMemoryResource* firstAddress = + manager.customResources()[0].get(); + + for (int i = 0; i < 128; ++i) { + manager.registerCustomResource(makeResource(fmt::format("r{}", i))); + } + + EXPECT_EQ(manager.customResources()[0].get(), firstAddress) + << "Address of an already-registered resource must not change when " + "new resources are added."; + EXPECT_EQ(firstAddress->tag, "first"); +} + +} // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/MemoryManagerTest.cpp b/velox/common/memory/tests/MemoryManagerTest.cpp index 151e1d8dbab..260d446131f 100644 --- a/velox/common/memory/tests/MemoryManagerTest.cpp +++ b/velox/common/memory/tests/MemoryManagerTest.cpp @@ -295,61 +295,6 @@ TEST_F(MemoryManagerTest, addPoolWithArbitrator) { } // TODO: remove this test when remove deprecatedDefaultMemoryManager. -TEST_F(MemoryManagerTest, customResourceRegistration) { - MemoryManager manager{}; - ASSERT_TRUE(manager.customResources().empty()); - - MemoryAllocator::Options allocatorOptions; - allocatorOptions.capacity = 1L << 30; - - CustomMemoryResource resource; - resource.tag = "test-resource"; - resource.maxCapacity = 1L << 28; - resource.allocator = std::make_shared(allocatorOptions); - manager.registerCustomResource(std::move(resource)); - - ASSERT_EQ(manager.customResources().size(), 1); - EXPECT_EQ(manager.customResources()[0].tag, "test-resource"); - EXPECT_EQ(manager.customResources()[0].maxCapacity, 1L << 28); - - // Empty tag is rejected. - CustomMemoryResource emptyTag; - emptyTag.allocator = std::make_shared(allocatorOptions); - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(emptyTag)), - "CustomMemoryResource tag is empty"); - - // Null allocator is rejected. - CustomMemoryResource nullAllocator; - nullAllocator.tag = "another"; - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullAllocator)), - "CustomMemoryResource allocator is null for tag: another"); - - // Duplicate tag is rejected. - CustomMemoryResource duplicate; - duplicate.tag = "test-resource"; - duplicate.allocator = std::make_shared(allocatorOptions); - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(duplicate)), - "CustomMemoryResource already registered for tag: test-resource"); - - // The first registration is unaffected. - ASSERT_EQ(manager.customResources().size(), 1); -} - -TEST_F(MemoryManagerTest, addRootPoolWithResourceTag) { - MemoryManager manager{}; - auto pool = manager.addRootPool( - "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); - ASSERT_NE(pool, nullptr); - ASSERT_TRUE(pool->resourceTag().has_value()); - EXPECT_EQ(*pool->resourceTag(), "gpu"); - - auto untagged = manager.addRootPool("untagged-root"); - EXPECT_FALSE(untagged->resourceTag().has_value()); -} - TEST_F(MemoryManagerTest, defaultMemoryManager) { auto& managerA = toMemoryManager(deprecatedDefaultMemoryManager()); auto& managerB = toMemoryManager(deprecatedDefaultMemoryManager()); diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index 8440b339089..90a026e75e7 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -58,19 +58,16 @@ std::shared_ptr QueryCtx::Builder::build() { queryCtx->maybeSetReclaimer(); for (const auto& mr : memory::memoryManager()->customResources()) { auto reclaimer = - mr.reclaimerFactory ? mr.reclaimerFactory(queryCtx.get()) : nullptr; - // Reuse generatePoolName so the custom pool gets the same uniqueness - // guarantee as the regular query pool (atomic seq counter), then suffix - // the resource tag so it's identifiable in pool dumps. + mr->reclaimerFactory ? mr->reclaimerFactory(queryCtx.get()) : nullptr; auto pool = memory::memoryManager()->addRootPool( fmt::format( "{}.{}", QueryCtx::generatePoolName(queryCtx->queryId()), - mr.tag), - mr.maxCapacity, + mr->tag), + mr->maxCapacity, std::move(reclaimer), std::nullopt, - mr.tag); + mr->tag); queryCtx->addCustomPool(std::move(pool)); } for (auto& cb : releaseCallbacks_) { diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 236f76db5bd..8499c57dedf 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -379,17 +379,15 @@ class QueryCtx : public std::enable_shared_from_this { pool_ = std::move(pool); } - /// Tracks an additional root pool tied to a custom memory resource. Called - /// by Builder::build for each entry in MemoryManager::customResources. + /// Tracks an additional root pool tied to a custom memory resource. void addCustomPool(std::shared_ptr pool); - /// Returns the custom root pool associated with the resource tag, or - /// nullptr if no pool was registered under that tag for this query. + /// Returns the custom root pool for the given resource tag, or nullptr if + /// none is registered under that tag for this query. std::shared_ptr customPool( const std::string& tag) const; - /// Returns all custom root pools registered for this query, in registration - /// order. The reference is stable for the lifetime of the QueryCtx. + /// Returns all custom root pools for this query, in registration order. const std::vector>& customPools() const { return customPools_; } diff --git a/velox/core/tests/QueryCtxTest.cpp b/velox/core/tests/QueryCtxTest.cpp index b78b6db0fe1..44dbc5927c4 100644 --- a/velox/core/tests/QueryCtxTest.cpp +++ b/velox/core/tests/QueryCtxTest.cpp @@ -16,7 +16,6 @@ #include #include "velox/common/base/tests/GTestUtils.h" -#include "velox/common/memory/MallocAllocator.h" #include "velox/core/QueryCtx.h" namespace facebook::velox::core::test { @@ -114,38 +113,6 @@ TEST_F(QueryCtxTest, releaseCallbackException) { ASSERT_EQ(callbackCount, 2); } -TEST_F(QueryCtxTest, customResourcePerQueryPool) { - memory::MemoryAllocator::Options allocatorOptions; - allocatorOptions.capacity = 1L << 30; - - memory::CustomMemoryResource resource; - resource.tag = "qctx-test-resource"; - resource.maxCapacity = 1L << 28; - resource.allocator = - std::make_shared(allocatorOptions); - - bool factoryInvoked = false; - resource.reclaimerFactory = [&factoryInvoked](QueryCtx* /*ctx*/) { - factoryInvoked = true; - return memory::MemoryReclaimer::create(0); - }; - - memory::memoryManager()->registerCustomResource(std::move(resource)); - - auto queryCtx = QueryCtx::Builder().queryId("custom-resource-query").build(); - EXPECT_TRUE(factoryInvoked); - - auto pool = queryCtx->customPool("qctx-test-resource"); - ASSERT_NE(pool, nullptr); - EXPECT_TRUE(pool->name().starts_with("query.custom-resource-query.")); - EXPECT_TRUE(pool->name().ends_with(".qctx-test-resource")); - ASSERT_TRUE(pool->resourceTag().has_value()); - EXPECT_EQ(*pool->resourceTag(), "qctx-test-resource"); - - EXPECT_EQ(queryCtx->customPool("does-not-exist"), nullptr); - EXPECT_EQ(queryCtx->customPools().size(), 1); -} - TEST_F(QueryCtxTest, builderReleaseCallbacks) { int callbackCount = 0; std::string capturedQueryId; From 2f21d607ecb4f55bda083eda2e3954e7d4250ad8 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Fri, 15 May 2026 05:19:40 +0000 Subject: [PATCH 03/10] refactor(memory): Tighten CustomMemoryResource contract and clean up dispatch - Require arbitrator and reclaimerFactory on registration; registration now throws on any null required field. - Drop pure-virtual MemoryPool::arbitrator() in favor of a nullptr default so external subclassers keep compiling; MemoryPoolImpl still overrides to return the resolved arbitrator. - Move QueryCtx custom-pool creation into an anonymous-namespace helper taking a raw QueryCtx*. - Refresh doc comments on the public API and restore the load-bearing invariant that pools share ownership of CustomMemoryResource. - Update test helper to populate the new required fields, add negative coverage for null arbitrator/factory, and drop the now-contradictory noReclaimerFactory test. --- velox/common/memory/CustomMemoryResource.h | 11 +++--- velox/common/memory/Memory.cpp | 8 +++++ velox/common/memory/Memory.h | 12 +++---- velox/common/memory/MemoryPool.h | 4 ++- .../memory/tests/CustomMemoryResourceTest.cpp | 26 +++++++++----- velox/core/QueryCtx.cpp | 34 +++++++++++-------- 6 files changed, 58 insertions(+), 37 deletions(-) diff --git a/velox/common/memory/CustomMemoryResource.h b/velox/common/memory/CustomMemoryResource.h index c9bd9f50177..4c9150da86d 100644 --- a/velox/common/memory/CustomMemoryResource.h +++ b/velox/common/memory/CustomMemoryResource.h @@ -42,16 +42,15 @@ struct CustomMemoryResource { /// Capacity of the per-query root pool created for this resource. int64_t maxCapacity{std::numeric_limits::max()}; - /// Allocator backing pools tagged with this resource. Required. + /// Allocator backing pools tagged with this resource. std::shared_ptr allocator; - /// Per-resource arbitrator. When non-null, pools tagged with this resource - /// route capacity decisions through it. When null, they fall back to the - /// MemoryManager's default arbitrator. + /// Arbitrator routing capacity decisions for pools tagged with this + /// resource. std::shared_ptr arbitrator; - /// Optional factory invoked once per QueryCtx to build the reclaimer - /// attached to the resource's per-query root pool. + /// Invoked once per QueryCtx to build the reclaimer attached to the + /// per-query root pool. std::function(core::QueryCtx*)> reclaimerFactory; }; diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 805264019c9..763dfa69574 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -397,6 +397,14 @@ void MemoryManager::registerCustomResource(CustomMemoryResource resource) { resource.allocator, "CustomMemoryResource allocator is null for tag: {}", resource.tag); + VELOX_USER_CHECK_NOT_NULL( + resource.arbitrator, + "CustomMemoryResource arbitrator is null for tag: {}", + resource.tag); + VELOX_USER_CHECK_NOT_NULL( + resource.reclaimerFactory, + "CustomMemoryResource reclaimerFactory is null for tag: {}", + resource.tag); for (const auto& existing : customResources_) { VELOX_USER_CHECK_NE( existing->tag, diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index 16225892fb0..6b2ae79725a 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -226,13 +226,12 @@ class MemoryManager { std::nullopt, const std::optional& resourceTag = std::nullopt); - /// Registers a custom memory resource. NOT thread-safe: must be called - /// during process startup before any QueryCtx is built. Throws on empty - /// tag, null allocator, or duplicate tag. + /// Registers a custom memory resource. Throws on empty or duplicate tag, + /// or on any null required field. NOT thread-safe: must be called during + /// process startup. void registerCustomResource(CustomMemoryResource resource); - /// Returns the registered custom resources in registration order. Safe to - /// read concurrently once registrations have completed. + /// Returns the registered custom resources in registration order. const std::vector>& customResources() const; @@ -351,8 +350,7 @@ class MemoryManager { // All user root pools allocated from 'this'. std::unordered_map> pools_; - // Append-only; see registerCustomResource for the threading contract. - // Pools hold shared_ptrs so a resource outlives every pool that uses it. + // Shared ownership so a resource outlives every pool that uses it. std::vector> customResources_; }; diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index 8369bc7d3f8..626fcb4289c 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -434,7 +434,9 @@ class MemoryPool : public std::enable_shared_from_this { /// Returns the arbitrator that governs this pool's capacity. May be the /// MemoryManager's default arbitrator or a custom resource's arbitrator. - virtual MemoryArbitrator* arbitrator() const = 0; + virtual MemoryArbitrator* arbitrator() const { + return nullptr; + } /// Function estimates the number of reclaimable bytes and returns in /// 'reclaimableBytes'. If the 'reclaimer' is not set, the function returns diff --git a/velox/common/memory/tests/CustomMemoryResourceTest.cpp b/velox/common/memory/tests/CustomMemoryResourceTest.cpp index b924d590222..35bc8891cbc 100644 --- a/velox/common/memory/tests/CustomMemoryResourceTest.cpp +++ b/velox/common/memory/tests/CustomMemoryResourceTest.cpp @@ -21,6 +21,7 @@ #include "velox/common/memory/CustomMemoryResource.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" #include "velox/core/QueryCtx.h" namespace facebook::velox::memory::test { @@ -32,6 +33,10 @@ CustomMemoryResource makeResource(const std::string& tag) { CustomMemoryResource resource; resource.tag = tag; resource.allocator = std::make_shared(allocatorOptions); + resource.arbitrator = MemoryArbitrator::create({}); + resource.reclaimerFactory = [](core::QueryCtx*) { + return MemoryReclaimer::create(0); + }; return resource; } @@ -65,6 +70,18 @@ TEST_F(CustomMemoryResourceManagerTest, registrationValidation) { manager.registerCustomResource(std::move(nullAllocator)), "CustomMemoryResource allocator is null for tag: another"); + auto nullArbitrator = makeResource("another"); + nullArbitrator.arbitrator.reset(); + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullArbitrator)), + "CustomMemoryResource arbitrator is null for tag: another"); + + auto nullFactory = makeResource("another"); + nullFactory.reclaimerFactory = nullptr; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullFactory)), + "CustomMemoryResource reclaimerFactory is null for tag: another"); + VELOX_ASSERT_THROW( manager.registerCustomResource(makeResource("test-resource")), "CustomMemoryResource already registered for tag: test-resource"); @@ -188,15 +205,6 @@ TEST_F(CustomMemoryResourceQueryCtxTest, perQueryRootPoolCreated) { EXPECT_EQ(queryCtx->customPools().size(), 1); } -TEST_F(CustomMemoryResourceQueryCtxTest, noReclaimerFactory) { - memoryManager()->registerCustomResource(makeResource("plain")); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-plain").build(); - auto pool = queryCtx->customPool("plain"); - ASSERT_NE(pool, nullptr); - EXPECT_EQ(pool->reclaimer(), nullptr); -} - TEST_F(CustomMemoryResourceQueryCtxTest, customPoolsMatchRegistrationOrder) { for (const auto* tag : {"a", "b", "c"}) { memoryManager()->registerCustomResource(makeResource(tag)); diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index 90a026e75e7..f1fe027869d 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -20,6 +20,25 @@ #include "velox/common/config/Config.h" namespace facebook::velox::core { +namespace { + +void createCustomMemoryPools(QueryCtx* queryCtx) { + for (const auto& mr : memory::memoryManager()->customResources()) { + auto reclaimer = mr->reclaimerFactory(queryCtx); + auto pool = memory::memoryManager()->addRootPool( + fmt::format( + "{}.{}", + QueryCtx::generatePoolName(queryCtx->queryId()), + mr->tag), + mr->maxCapacity, + std::move(reclaimer), + std::nullopt, + mr->tag); + queryCtx->addCustomPool(std::move(pool)); + } +} + +} // namespace // static std::shared_ptr QueryCtx::create( @@ -56,23 +75,10 @@ std::shared_ptr QueryCtx::Builder::build() { std::move(tokenProvider_), std::move(traceCtxProvider_))); queryCtx->maybeSetReclaimer(); - for (const auto& mr : memory::memoryManager()->customResources()) { - auto reclaimer = - mr->reclaimerFactory ? mr->reclaimerFactory(queryCtx.get()) : nullptr; - auto pool = memory::memoryManager()->addRootPool( - fmt::format( - "{}.{}", - QueryCtx::generatePoolName(queryCtx->queryId()), - mr->tag), - mr->maxCapacity, - std::move(reclaimer), - std::nullopt, - mr->tag); - queryCtx->addCustomPool(std::move(pool)); - } for (auto& cb : releaseCallbacks_) { queryCtx->addReleaseCallback(std::move(cb)); } + createCustomMemoryPools(queryCtx.get()); return queryCtx; } From ed905f057b84af9d6b0535de5bde81cc7699de6c Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Fri, 15 May 2026 07:39:02 +0000 Subject: [PATCH 04/10] Refactor memory pool and custom memory resource test --- velox/common/memory/Memory.cpp | 7 +- velox/common/memory/MemoryPool.cpp | 21 +- velox/common/memory/MemoryPool.h | 27 +-- .../memory/tests/CustomMemoryResourceTest.cpp | 197 ++++-------------- velox/core/QueryCtx.cpp | 18 +- velox/core/QueryCtx.h | 15 +- 6 files changed, 85 insertions(+), 200 deletions(-) diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 763dfa69574..bdcbbb5dec9 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -300,16 +300,19 @@ std::shared_ptr MemoryManager::addRootPool( options.getPreferredSize = getPreferredSize_; options.debugOptions = poolDebugOpts; if (resourceTag.has_value()) { + std::shared_ptr matched; for (const auto& candidate : customResources_) { if (candidate->tag == *resourceTag) { - options.resource = candidate; + matched = candidate; break; } } VELOX_USER_CHECK_NOT_NULL( - options.resource, + matched, "No CustomMemoryResource registered for tag: {}", *resourceTag); + options.customAllocator = matched->allocator.get(); + options.customArbitrator = matched->arbitrator.get(); } auto pool = createRootPool(poolName, reclaimer, options); diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp index 796a38c3a4c..df9461992e2 100644 --- a/velox/common/memory/MemoryPool.cpp +++ b/velox/common/memory/MemoryPool.cpp @@ -243,7 +243,6 @@ MemoryPool::MemoryPool( trackUsage_(options.trackUsage), threadSafe_(options.threadSafe), debugOptions_(options.debugOptions), - resource_(options.resource), coreOnAllocationFailureEnabled_(options.coreOnAllocationFailureEnabled), getPreferredSize_( options.getPreferredSize == nullptr @@ -260,13 +259,6 @@ MemoryPool::~MemoryPool() { VELOX_CHECK(children_.empty()); } -std::optional MemoryPool::resourceTag() const { - if (resource_ == nullptr) { - return std::nullopt; - } - return resource_->tag; -} - // static std::string MemoryPool::kindString(Kind kind) { switch (kind) { @@ -466,13 +458,11 @@ MemoryPoolImpl::MemoryPoolImpl( : MemoryPool{name, kind, parent, options}, manager_{memoryManager}, allocator_{ - options.resource != nullptr && options.resource->allocator - ? options.resource->allocator.get() - : manager_->allocator()}, + options.customAllocator != nullptr ? options.customAllocator + : manager_->allocator()}, arbitrator_{ - options.resource != nullptr && options.resource->arbitrator - ? options.resource->arbitrator.get() - : manager_->arbitrator()}, + options.customArbitrator != nullptr ? options.customArbitrator + : manager_->arbitrator()}, reclaimer_(std::move(reclaimer)), // The memory manager sets the capacity through grow() according to the // actually used memory arbitration policy. @@ -918,7 +908,8 @@ std::shared_ptr MemoryPoolImpl::genChild( .coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_, .getPreferredSize = getPreferredSize, .debugOptions = debugOptions_, - .resource = resource_}); + .customAllocator = allocator_, + .customArbitrator = arbitrator_}); } bool MemoryPoolImpl::maybeReserve(uint64_t increment) { diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index 626fcb4289c..ceb8ab42ae7 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -39,7 +39,6 @@ class ParallelMemoryReclaimer; namespace facebook::velox::memory { class TestArbitrator; class MemoryManager; -struct CustomMemoryResource; constexpr int64_t kMaxMemory = std::numeric_limits::max(); @@ -166,11 +165,16 @@ class MemoryPool : public std::enable_shared_from_this { /// If non-empty, enables debug mode for the created memory pool. std::optional debugOptions{std::nullopt}; - /// Custom memory resource this pool belongs to. Non-null means the pool - /// routes allocations to this resource's allocator (and arbitrator, if - /// set). Null means the default CPU resource. The pool keeps a strong - /// reference, so the resource outlives every pool that uses it. - std::shared_ptr resource{nullptr}; + /// Allocator override. Non-null routes allocations through this allocator + /// instead of the MemoryManager's default. Borrowed; the owner (typically + /// a CustomMemoryResource held by the MemoryManager) must outlive the + /// pool. + MemoryAllocator* customAllocator{nullptr}; + + /// Arbitrator override. Non-null routes capacity decisions through this + /// arbitrator instead of the MemoryManager's default. Same ownership + /// contract as 'customAllocator'. + MemoryArbitrator* customArbitrator{nullptr}; }; /// Constructs a named memory pool with specified 'name', 'parent' and 'kind'. @@ -350,16 +354,6 @@ class MemoryPool : public std::enable_shared_from_this { return alignment_; } - /// Returns the custom memory resource this pool routes allocations to, or - /// nullptr for the default CPU resource. - virtual const std::shared_ptr& resource() const { - return resource_; - } - - /// Returns the tag of the custom resource this pool was created against, or - /// std::nullopt for the default CPU resource. - virtual std::optional resourceTag() const; - /// Resource governing methods used to track and limit the memory usage /// through this memory pool object. @@ -623,7 +617,6 @@ class MemoryPool : public std::enable_shared_from_this { const bool trackUsage_; const bool threadSafe_; const std::optional debugOptions_; - const std::shared_ptr resource_; const bool coreOnAllocationFailureEnabled_; std::function getPreferredSize_; diff --git a/velox/common/memory/tests/CustomMemoryResourceTest.cpp b/velox/common/memory/tests/CustomMemoryResourceTest.cpp index 35bc8891cbc..a57f85a58e5 100644 --- a/velox/common/memory/tests/CustomMemoryResourceTest.cpp +++ b/velox/common/memory/tests/CustomMemoryResourceTest.cpp @@ -58,40 +58,40 @@ TEST_F(CustomMemoryResourceManagerTest, registrationValidation) { EXPECT_EQ(manager.customResources()[0]->tag, "test-resource"); EXPECT_EQ(manager.customResources()[0]->maxCapacity, 1L << 28); + // Try to register another resource with the same tag, which should be rejected. + VELOX_ASSERT_THROW( + manager.registerCustomResource(makeResource("test-resource")), + "CustomMemoryResource already registered for tag: test-resource"); + ASSERT_EQ(manager.customResources().size(), 1); + + // Try to register resources with missing tag, which should be rejected. CustomMemoryResource emptyTag; emptyTag.allocator = makeResource("ignored").allocator; VELOX_ASSERT_THROW( manager.registerCustomResource(std::move(emptyTag)), "CustomMemoryResource tag is empty"); + // Try to register resources with null allocator, which should be rejected. CustomMemoryResource nullAllocator; nullAllocator.tag = "another"; VELOX_ASSERT_THROW( manager.registerCustomResource(std::move(nullAllocator)), "CustomMemoryResource allocator is null for tag: another"); + // Try to register resources with null arbitrator, which should be rejected. auto nullArbitrator = makeResource("another"); nullArbitrator.arbitrator.reset(); VELOX_ASSERT_THROW( manager.registerCustomResource(std::move(nullArbitrator)), "CustomMemoryResource arbitrator is null for tag: another"); + // Try to register resources with null reclaimerFactory, which should be + // rejected. auto nullFactory = makeResource("another"); nullFactory.reclaimerFactory = nullptr; VELOX_ASSERT_THROW( manager.registerCustomResource(std::move(nullFactory)), "CustomMemoryResource reclaimerFactory is null for tag: another"); - - VELOX_ASSERT_THROW( - manager.registerCustomResource(makeResource("test-resource")), - "CustomMemoryResource already registered for tag: test-resource"); - - ASSERT_EQ(manager.customResources().size(), 1); -} - -TEST_F(CustomMemoryResourceManagerTest, customResourcesEmptyByDefault) { - MemoryManager manager{}; - EXPECT_TRUE(manager.customResources().empty()); } TEST_F(CustomMemoryResourceManagerTest, registrationOrderPreserved) { @@ -105,19 +105,20 @@ TEST_F(CustomMemoryResourceManagerTest, registrationOrderPreserved) { EXPECT_EQ(manager.customResources()[2]->tag, "c"); } -TEST_F(CustomMemoryResourceManagerTest, addRootPoolPropagatesResourceTag) { +TEST_F(CustomMemoryResourceManagerTest, addRootPoolRejectsUnregisteredTag) { MemoryManager manager{}; manager.registerCustomResource(makeResource("gpu")); + // Adding a root pool with a registered tag should succeed. auto tagged = manager.addRootPool( "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); ASSERT_NE(tagged, nullptr); - ASSERT_TRUE(tagged->resourceTag().has_value()); - EXPECT_EQ(*tagged->resourceTag(), "gpu"); + // Adding a root pool without a tag should succeed. auto untagged = manager.addRootPool("untagged-root"); - EXPECT_FALSE(untagged->resourceTag().has_value()); + ASSERT_NE(untagged, nullptr); + // Adding a root pool with an unregistered tag should be rejected. VELOX_ASSERT_THROW( manager.addRootPool( "bad-root", kMaxMemory, nullptr, std::nullopt, "not-registered"), @@ -138,44 +139,7 @@ class CustomMemoryResourceQueryCtxTest : public testing::Test { } }; -TEST_F(CustomMemoryResourceQueryCtxTest, queryCtxWithoutResourcesHasNoCustomPools) { - auto queryCtx = core::QueryCtx::Builder().queryId("q-empty").build(); - EXPECT_TRUE(queryCtx->customPools().empty()); - EXPECT_EQ(queryCtx->customPool("anything"), nullptr); -} - -TEST_F(CustomMemoryResourceQueryCtxTest, addCustomPoolRejectsNull) { - auto queryCtx = core::QueryCtx::Builder().queryId("q-null").build(); - VELOX_ASSERT_THROW(queryCtx->addCustomPool(nullptr), ""); -} - -TEST_F(CustomMemoryResourceQueryCtxTest, reclaimerFactoryExceptionPropagates) { - auto resource = makeResource("boom"); - resource.reclaimerFactory = - [](core::QueryCtx*) -> std::unique_ptr { - throw std::runtime_error("factory boom"); - }; - memoryManager()->registerCustomResource(std::move(resource)); - - EXPECT_THROW( - core::QueryCtx::Builder().queryId("q-boom").build(), - std::runtime_error); -} - -TEST_F(CustomMemoryResourceQueryCtxTest, multipleQueryCtxsGetDistinctCustomPools) { - memoryManager()->registerCustomResource(makeResource("gpu")); - - auto q1 = core::QueryCtx::Builder().queryId("q-shared").build(); - auto q2 = core::QueryCtx::Builder().queryId("q-shared").build(); - auto p1 = q1->customPool("gpu"); - auto p2 = q2->customPool("gpu"); - ASSERT_NE(p1, nullptr); - ASSERT_NE(p2, nullptr); - EXPECT_NE(p1, p2); - EXPECT_NE(p1->name(), p2->name()); -} - -TEST_F(CustomMemoryResourceQueryCtxTest, perQueryRootPoolCreated) { +TEST_F(CustomMemoryResourceQueryCtxTest, customPoolCreation) { auto resource = makeResource("gpu"); resource.maxCapacity = 1L << 28; @@ -197,47 +161,49 @@ TEST_F(CustomMemoryResourceQueryCtxTest, perQueryRootPoolCreated) { ASSERT_NE(pool, nullptr); EXPECT_TRUE(pool->name().starts_with("query.q0.")); EXPECT_TRUE(pool->name().ends_with(".gpu")); - ASSERT_TRUE(pool->resourceTag().has_value()); - EXPECT_EQ(*pool->resourceTag(), "gpu"); EXPECT_EQ(pool->maxCapacity(), 1L << 28); EXPECT_EQ(queryCtx->customPool("missing"), nullptr); EXPECT_EQ(queryCtx->customPools().size(), 1); } -TEST_F(CustomMemoryResourceQueryCtxTest, customPoolsMatchRegistrationOrder) { +TEST_F(CustomMemoryResourceQueryCtxTest, customPoolsKeyedByTag) { for (const auto* tag : {"a", "b", "c"}) { memoryManager()->registerCustomResource(makeResource(tag)); } - auto queryCtx = core::QueryCtx::Builder().queryId("q-order").build(); + auto queryCtx = core::QueryCtx::Builder().queryId("q-keyed").build(); ASSERT_EQ(queryCtx->customPools().size(), 3); - EXPECT_EQ(*queryCtx->customPools()[0]->resourceTag(), "a"); - EXPECT_EQ(*queryCtx->customPools()[1]->resourceTag(), "b"); - EXPECT_EQ(*queryCtx->customPools()[2]->resourceTag(), "c"); + EXPECT_NE(queryCtx->customPool("a"), nullptr); + EXPECT_NE(queryCtx->customPool("b"), nullptr); + EXPECT_NE(queryCtx->customPool("c"), nullptr); } -// Locks down the chained-reclaimer ordering contract: a later-registered -// resource's reclaimerFactory must see earlier resources' per-query pools -// already attached to the QueryCtx. A factory cannot see its own pool because -// the factory runs before that pool is added. -TEST_F(CustomMemoryResourceQueryCtxTest, reclaimerFactorySeesPriorSiblingPool) { - memoryManager()->registerCustomResource(makeResource("primary")); - - std::shared_ptr capturedPrimary; - std::shared_ptr capturedSelf; - auto secondary = makeResource("secondary"); - secondary.reclaimerFactory = [&](core::QueryCtx* ctx) { - capturedPrimary = ctx->customPool("primary"); - capturedSelf = ctx->customPool("secondary"); - return MemoryReclaimer::create(0); - }; - memoryManager()->registerCustomResource(std::move(secondary)); +// Locks down the core dispatch invariant: a custom-resource root pool and its +// children must allocate through the resource's allocator, not the +// MemoryManager default. +TEST_F( + CustomMemoryResourceQueryCtxTest, + customPoolDispatchesToResourceAllocator) { + auto resource = makeResource("gpu"); + auto* expectedAllocator = resource.allocator.get(); + memoryManager()->registerCustomResource(std::move(resource)); - auto queryCtx = core::QueryCtx::Builder().queryId("q-chain").build(); - ASSERT_NE(capturedPrimary, nullptr); - EXPECT_EQ(capturedPrimary, queryCtx->customPool("primary")); - EXPECT_EQ(capturedSelf, nullptr); + auto queryCtx = core::QueryCtx::Builder().queryId("q-dispatch").build(); + auto root = queryCtx->customPool("gpu"); + ASSERT_NE(root, nullptr); + EXPECT_EQ( + static_cast(root.get())->testingAllocator(), + expectedAllocator); + + auto aggregate = root->addAggregateChild("agg"); + auto leaf = aggregate->addLeafChild("leaf"); + EXPECT_EQ( + static_cast(aggregate.get())->testingAllocator(), + expectedAllocator); + EXPECT_EQ( + static_cast(leaf.get())->testingAllocator(), + expectedAllocator); } // Test reclaimer that "spills" by allocating from a sibling resource's pool. @@ -322,75 +288,4 @@ TEST_F(CustomMemoryResourceQueryCtxTest, deviceReclaimerSpillsToHostSibling) { EXPECT_EQ(reclaimed, target); EXPECT_GE(hostPool->usedBytes(), static_cast(target)); } - -// Gap-coverage tests below. These pin the contracts that allocator dispatch -// and pointer-stability must satisfy. They are expected to fail until the -// dispatch and storage gaps are implemented. - -// Gap 1: a pool created against a custom resource must allocate through that -// resource's allocator, not the MemoryManager default. -TEST_F( - CustomMemoryResourceQueryCtxTest, - rootPoolDispatchesToCustomResourceAllocator) { - auto resource = makeResource("gpu"); - auto* expectedAllocator = resource.allocator.get(); - memoryManager()->registerCustomResource(std::move(resource)); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-dispatch").build(); - auto pool = queryCtx->customPool("gpu"); - ASSERT_NE(pool, nullptr); - auto* impl = static_cast(pool.get()); - EXPECT_EQ(impl->testingAllocator(), expectedAllocator) - << "Root pool must resolve its allocator from the custom resource, " - "not the MemoryManager default."; -} - -// Gap 3: a child of a custom-resource root pool must inherit the resource's -// allocator (and, transitively, its arbitrator once dispatch is wired). -TEST_F( - CustomMemoryResourceQueryCtxTest, - childPoolInheritsCustomResourceAllocator) { - auto resource = makeResource("gpu"); - auto* expectedAllocator = resource.allocator.get(); - memoryManager()->registerCustomResource(std::move(resource)); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-child").build(); - auto root = queryCtx->customPool("gpu"); - ASSERT_NE(root, nullptr); - auto aggregate = root->addAggregateChild("agg"); - auto leaf = aggregate->addLeafChild("leaf"); - - EXPECT_EQ( - static_cast(aggregate.get())->testingAllocator(), - expectedAllocator); - EXPECT_EQ( - static_cast(leaf.get())->testingAllocator(), - expectedAllocator); - ASSERT_TRUE(leaf->resourceTag().has_value()); - EXPECT_EQ(*leaf->resourceTag(), "gpu"); -} - -// Gap 4: the registration list must offer stable references. Once dispatch is -// wired, pools will store CustomMemoryResource* pointers obtained at -// construction time; later registrations must not invalidate them. -// -// Compares addresses only — does not dereference the captured pointer, since -// dereferencing a dangling pointer on contract violation would crash gtest -// before it can report the failure. -TEST_F(CustomMemoryResourceManagerTest, registrationKeepsExistingPointersStable) { - MemoryManager manager{}; - manager.registerCustomResource(makeResource("first")); - const CustomMemoryResource* firstAddress = - manager.customResources()[0].get(); - - for (int i = 0; i < 128; ++i) { - manager.registerCustomResource(makeResource(fmt::format("r{}", i))); - } - - EXPECT_EQ(manager.customResources()[0].get(), firstAddress) - << "Address of an already-registered resource must not change when " - "new resources are added."; - EXPECT_EQ(firstAddress->tag, "first"); -} - } // namespace facebook::velox::memory::test diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index f1fe027869d..fa13473f814 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -34,7 +34,7 @@ void createCustomMemoryPools(QueryCtx* queryCtx) { std::move(reclaimer), std::nullopt, mr->tag); - queryCtx->addCustomPool(std::move(pool)); + queryCtx->addCustomPool(mr->tag, std::move(pool)); } } @@ -125,20 +125,18 @@ QueryCtx::~QueryCtx() { return fmt::format("query.{}.{}", queryId, seqNum++); } -void QueryCtx::addCustomPool(std::shared_ptr pool) { +void QueryCtx::addCustomPool( + const std::string& tag, + std::shared_ptr pool) { VELOX_CHECK_NOT_NULL(pool); - customPools_.push_back(std::move(pool)); + auto [_, inserted] = customPools_.emplace(tag, std::move(pool)); + VELOX_CHECK(inserted, "Duplicate custom pool tag: {}", tag); } std::shared_ptr QueryCtx::customPool( const std::string& tag) const { - for (const auto& pool : customPools_) { - const auto& poolTag = pool->resourceTag(); - if (poolTag.has_value() && *poolTag == tag) { - return pool; - } - } - return nullptr; + auto it = customPools_.find(tag); + return it == customPools_.end() ? nullptr : it->second; } void QueryCtx::maybeSetReclaimer() { diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 8499c57dedf..a6d4588ffdd 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -379,16 +379,20 @@ class QueryCtx : public std::enable_shared_from_this { pool_ = std::move(pool); } - /// Tracks an additional root pool tied to a custom memory resource. - void addCustomPool(std::shared_ptr pool); + /// Tracks an additional root pool tied to a custom memory resource. Throws + /// if 'tag' is already present or 'pool' is null. + void addCustomPool( + const std::string& tag, + std::shared_ptr pool); /// Returns the custom root pool for the given resource tag, or nullptr if /// none is registered under that tag for this query. std::shared_ptr customPool( const std::string& tag) const; - /// Returns all custom root pools for this query, in registration order. - const std::vector>& customPools() const { + /// Returns all custom root pools for this query, keyed by resource tag. + const std::unordered_map>& + customPools() const { return customPools_; } @@ -484,7 +488,8 @@ class QueryCtx : public std::enable_shared_from_this { std::unordered_map> connectorSessionProperties_; std::shared_ptr pool_; - std::vector> customPools_; + std::unordered_map> + customPools_; QueryConfig queryConfig_; std::atomic numSpilledBytes_{0}; std::atomic numTracedBytes_{0}; From e2a745ccdcf70e5f0c33558ae47d5d3d470959a3 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Mon, 18 May 2026 07:58:52 +0000 Subject: [PATCH 05/10] fix: make changes suggested by PR review --- velox/common/base/tests/StatsReporterTest.cpp | 4 + velox/common/memory/CustomMemoryResource.h | 14 +- velox/common/memory/Memory.cpp | 64 ++- velox/common/memory/Memory.h | 35 +- velox/common/memory/MemoryPool.h | 4 +- velox/common/memory/tests/CMakeLists.txt | 4 +- .../tests/CustomMemoryEmulationTest.cpp | 444 ++++++++++++++++++ .../memory/tests/CustomMemoryPoolTest.cpp | 321 +++++++++++++ .../tests/CustomMemoryRegistrationTest.cpp | 122 +++++ .../memory/tests/CustomMemoryResourceTest.cpp | 291 ------------ velox/core/QueryCtx.cpp | 28 +- velox/core/QueryCtx.h | 29 +- velox/docs/develop/memory.rst | 154 ++++++ velox/docs/develop/spilling.rst | 123 +++++ 14 files changed, 1270 insertions(+), 367 deletions(-) create mode 100644 velox/common/memory/tests/CustomMemoryEmulationTest.cpp create mode 100644 velox/common/memory/tests/CustomMemoryPoolTest.cpp create mode 100644 velox/common/memory/tests/CustomMemoryRegistrationTest.cpp delete mode 100644 velox/common/memory/tests/CustomMemoryResourceTest.cpp diff --git a/velox/common/base/tests/StatsReporterTest.cpp b/velox/common/base/tests/StatsReporterTest.cpp index 974efd271e1..daddca74b25 100644 --- a/velox/common/base/tests/StatsReporterTest.cpp +++ b/velox/common/base/tests/StatsReporterTest.cpp @@ -344,6 +344,10 @@ class TestMemoryPool : public memory::MemoryPool { return nullptr; } + memory::MemoryArbitrator* arbitrator() const override { + return nullptr; + } + void enterArbitration() override {} void leaveArbitration() noexcept override {} diff --git a/velox/common/memory/CustomMemoryResource.h b/velox/common/memory/CustomMemoryResource.h index 4c9150da86d..00667a2e519 100644 --- a/velox/common/memory/CustomMemoryResource.h +++ b/velox/common/memory/CustomMemoryResource.h @@ -22,10 +22,6 @@ #include #include -namespace facebook::velox::core { -class QueryCtx; -} // namespace facebook::velox::core - namespace facebook::velox::memory { class MemoryAllocator; @@ -49,10 +45,12 @@ struct CustomMemoryResource { /// resource. std::shared_ptr arbitrator; - /// Invoked once per QueryCtx to build the reclaimer attached to the - /// per-query root pool. - std::function(core::QueryCtx*)> - reclaimerFactory; + /// Required. Builds a reclaimer for pools tagged with this resource. + /// Invoked by the caller (not the framework) when constructing the + /// per-query root pool. For reclaimers that need a QueryCtx-aware view, + /// the caller skips this factory and attaches the reclaimer post-build via + /// MemoryPool::setReclaimer. + std::function()> reclaimerFactory; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index bdcbbb5dec9..029999369fa 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -286,6 +286,37 @@ std::shared_ptr MemoryManager::addRootPool( std::unique_ptr reclaimer, const std::optional& poolDebugOpts, const std::optional& resourceTag) { + if (!resourceTag.has_value()) { + return addRootPoolImpl( + name, + maxCapacity, + std::move(reclaimer), + poolDebugOpts, + /*customAllocator=*/nullptr, + /*customArbitrator=*/nullptr); + } + auto it = customResources_.find(*resourceTag); + VELOX_USER_CHECK( + it != customResources_.end(), + "No CustomMemoryResource registered for tag: {}", + *resourceTag); + const auto& resource = *it->second; + return addRootPoolImpl( + name, + maxCapacity, + std::move(reclaimer), + poolDebugOpts, + resource.allocator.get(), + resource.arbitrator.get()); +} + +std::shared_ptr MemoryManager::addRootPoolImpl( + const std::string& name, + int64_t maxCapacity, + std::unique_ptr reclaimer, + const std::optional& poolDebugOpts, + MemoryAllocator* customAllocator, + MemoryArbitrator* customArbitrator) { std::string poolName = name; if (poolName.empty()) { static std::atomic poolId{0}; @@ -299,21 +330,8 @@ std::shared_ptr MemoryManager::addRootPool( options.coreOnAllocationFailureEnabled = coreOnAllocationFailureEnabled_; options.getPreferredSize = getPreferredSize_; options.debugOptions = poolDebugOpts; - if (resourceTag.has_value()) { - std::shared_ptr matched; - for (const auto& candidate : customResources_) { - if (candidate->tag == *resourceTag) { - matched = candidate; - break; - } - } - VELOX_USER_CHECK_NOT_NULL( - matched, - "No CustomMemoryResource registered for tag: {}", - *resourceTag); - options.customAllocator = matched->allocator.get(); - options.customArbitrator = matched->arbitrator.get(); - } + options.customAllocator = customAllocator; + options.customArbitrator = customArbitrator; auto pool = createRootPool(poolName, reclaimer, options); if (!disableMemoryPoolTracking_) { @@ -408,18 +426,14 @@ void MemoryManager::registerCustomResource(CustomMemoryResource resource) { resource.reclaimerFactory, "CustomMemoryResource reclaimerFactory is null for tag: {}", resource.tag); - for (const auto& existing : customResources_) { - VELOX_USER_CHECK_NE( - existing->tag, - resource.tag, - "CustomMemoryResource already registered for tag: {}", - resource.tag); - } - customResources_.push_back( - std::make_shared(std::move(resource))); + const auto tag = resource.tag; + auto [_, inserted] = customResources_.emplace( + tag, std::make_shared(std::move(resource))); + VELOX_USER_CHECK( + inserted, "CustomMemoryResource already registered for tag: {}", tag); } -const std::vector>& +const folly::F14FastMap>& MemoryManager::customResources() const { return customResources_; } diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index 6b2ae79725a..8fec6eebc32 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -216,8 +217,10 @@ class MemoryManager { /// Creates a root memory pool with specified 'name' and 'maxCapacity'. If /// 'name' is missing, the memory manager generates a default name internally - /// to ensure uniqueness. If 'resourceTag' is set, the pool is associated - /// with the custom memory resource registered under that tag. + /// to ensure uniqueness. If 'resourceTag' is set, the pool's allocator and + /// arbitrator come from the CustomMemoryResource previously registered + /// under that tag via registerCustomResource(); throws if the tag is not + /// registered. std::shared_ptr addRootPool( const std::string& name = "", int64_t maxCapacity = kMaxMemory, @@ -231,9 +234,10 @@ class MemoryManager { /// process startup. void registerCustomResource(CustomMemoryResource resource); - /// Returns the registered custom resources in registration order. - const std::vector>& customResources() - const; + /// Returns the registered custom resources keyed by tag. Iteration order + /// is unspecified. + const folly::F14FastMap>& + customResources() const; /// Creates a leaf memory pool for direct memory allocation use with specified /// 'name'. If 'name' is missing, the memory manager generates a default name @@ -309,17 +313,22 @@ class MemoryManager { return sharedLeafPools_; } - /// Test-only: drops all registered custom memory resources. - void testingClearCustomResources() { - customResources_.clear(); - } - private: std::shared_ptr createRootPool( std::string poolName, std::unique_ptr& reclaimer, MemoryPool::Options& options); + // Shared implementation for addRootPool overloads. 'customAllocator' and + // 'customArbitrator' are borrowed and may be null (default tier). + std::shared_ptr addRootPoolImpl( + const std::string& name, + int64_t maxCapacity, + std::unique_ptr reclaimer, + const std::optional& poolDebugOpts, + MemoryAllocator* customAllocator, + MemoryArbitrator* customArbitrator); + void dropPool(MemoryPool* pool); // Returns the shared references to all the alive memory pools in 'pools_'. @@ -350,8 +359,10 @@ class MemoryManager { // All user root pools allocated from 'this'. std::unordered_map> pools_; - // Shared ownership so a resource outlives every pool that uses it. - std::vector> customResources_; + // Shared ownership so a resource outlives every pool that uses it. Keyed + // by 'tag' so addRootPool() can resolve a resource in O(1). + folly::F14FastMap> + customResources_; }; /// Initializes the process-wide memory manager based on the specified diff --git a/velox/common/memory/MemoryPool.h b/velox/common/memory/MemoryPool.h index ceb8ab42ae7..9715dda847b 100644 --- a/velox/common/memory/MemoryPool.h +++ b/velox/common/memory/MemoryPool.h @@ -428,9 +428,7 @@ class MemoryPool : public std::enable_shared_from_this { /// Returns the arbitrator that governs this pool's capacity. May be the /// MemoryManager's default arbitrator or a custom resource's arbitrator. - virtual MemoryArbitrator* arbitrator() const { - return nullptr; - } + virtual MemoryArbitrator* arbitrator() const = 0; /// Function estimates the number of reclaimable bytes and returns in /// 'reclaimableBytes'. If the 'reclaimer' is not set, the function returns diff --git a/velox/common/memory/tests/CMakeLists.txt b/velox/common/memory/tests/CMakeLists.txt index deff8332a32..9e060414c9a 100644 --- a/velox/common/memory/tests/CMakeLists.txt +++ b/velox/common/memory/tests/CMakeLists.txt @@ -20,7 +20,9 @@ add_executable( ArbitrationParticipantTest.cpp ByteStreamTest.cpp CompactDoubleListTest.cpp - CustomMemoryResourceTest.cpp + CustomMemoryEmulationTest.cpp + CustomMemoryPoolTest.cpp + CustomMemoryRegistrationTest.cpp HashStringAllocatorTest.cpp MemoryAllocatorTest.cpp MemoryArbitratorTest.cpp diff --git a/velox/common/memory/tests/CustomMemoryEmulationTest.cpp b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp new file mode 100644 index 00000000000..a8a6a117b27 --- /dev/null +++ b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp @@ -0,0 +1,444 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" +#include "velox/core/QueryCtx.h" + +namespace facebook::velox::memory::test { +namespace { + +// Mirrors the row layout the docs assume for the CXL-backed HashAggregation +// example: a fixed-width group key plus a single int64_t accumulator. +struct EmulatedRow { + int64_t key; + int64_t sum; +}; + +class EmulatedCxlHashAggregation; + +// Reclaimer installed on the operator's DRAM leaf row pool. Forwards +// reclaim() to the operator's DRAM -> CXL spill method, modeling Phase 1 +// in the spilling docs. +class DramReclaimer : public MemoryReclaimer { + public: + explicit DramReclaimer(EmulatedCxlHashAggregation* op) + : MemoryReclaimer(0), op_(op) {} + + uint64_t reclaim( + MemoryPool* pool, + uint64_t targetBytes, + uint64_t maxWaitMs, + Stats& stats) override; + + private: + EmulatedCxlHashAggregation* const op_; +}; + +// Reclaimer attached to the CXL custom root pool. Constructed by the +// resource's reclaimerFactory before the operator exists; the operator +// fills in 'callback' after construction and clears it on destruction. +class CxlReclaimer : public MemoryReclaimer { + public: + using Callback = std::function; + + explicit CxlReclaimer(std::shared_ptr callback) + : MemoryReclaimer(0), callback_(std::move(callback)) {} + + uint64_t reclaim( + MemoryPool* /*pool*/, + uint64_t targetBytes, + uint64_t /*maxWaitMs*/, + Stats& /*stats*/) override { + if (!callback_ || !*callback_) { + return 0; + } + return (*callback_)(targetBytes); + } + + private: + const std::shared_ptr callback_; +}; + +// Mini hash-aggregation operator emulating the CXL-aware HashAggregation +// described in spilling.rst (`Reclaim Across Memory Resources`). Allocates +// row bodies through MemoryPool to keep used-byte counters honest, +// partitions rows by key, and exposes spill methods that the two +// reclaimers invoke. +class EmulatedCxlHashAggregation { + public: + static constexpr int kNumPartitions = 8; + + EmulatedCxlHashAggregation( + core::QueryCtx* ctx, + std::shared_ptr cxlCallbackSlot) + : ctx_(ctx), + dramRowPool_(ctx->pool()->addLeafChild("emulated-dram-rows")), + cxlRootPool_(ctx->customPool("cxl")), + cxlRowPool_(cxlRootPool_->addLeafChild("emulated-cxl-rows")), + cxlCallbackSlot_(std::move(cxlCallbackSlot)) { + VELOX_CHECK_NOT_NULL(cxlRootPool_); + // MemoryPool::setReclaimer requires the parent to have a reclaimer + // first. The default root pool typically has none, so install a noop + // one before attaching the DRAM-leaf reclaimer. + if (ctx_->pool()->reclaimer() == nullptr) { + ctx_->pool()->setReclaimer(MemoryReclaimer::create(0)); + } + dramRowPool_->setReclaimer(std::make_unique(this)); + *cxlCallbackSlot_ = [this](uint64_t targetBytes) { + return spillCxlPartitionToDisk(targetBytes); + }; + } + + ~EmulatedCxlHashAggregation() { + // Clear the CXL reclaimer's callback so a stale reclaim invocation + // after operator destruction cannot dereference 'this'. + if (cxlCallbackSlot_) { + *cxlCallbackSlot_ = {}; + } + // Free every row still resident in DRAM or CXL before the pool + // shared_ptrs go out of scope; otherwise the pool destructors abort + // on outstanding usage. + for (auto& [_, entry] : hashTable_) { + auto* pool = entry.location == Location::kDram ? dramRowPool_.get() + : cxlRowPool_.get(); + pool->free(entry.row, sizeof(EmulatedRow)); + } + hashTable_.clear(); + } + + EmulatedCxlHashAggregation(const EmulatedCxlHashAggregation&) = delete; + EmulatedCxlHashAggregation& operator=(const EmulatedCxlHashAggregation&) = + delete; + + void addInput(int64_t key, int64_t value) { + auto it = hashTable_.find(key); + if (it != hashTable_.end()) { + it->second.row->sum += value; + return; + } + auto* row = static_cast( + dramRowPool_->allocate(sizeof(EmulatedRow))); + row->key = key; + row->sum = value; + hashTable_.emplace(key, Entry{row, Location::kDram}); + } + + // Reads every row still resident in the hash table (DRAM and CXL both + // CPU-addressable in this emulation) and merges with disk-resident + // rows to produce the final per-key aggregate. + std::unordered_map finalize() const { + std::unordered_map result; + for (const auto& [_, entry] : hashTable_) { + result[entry.row->key] += entry.row->sum; + } + for (const auto& row : diskRows_) { + result[row.key] += row.sum; + } + return result; + } + + // Returns the DRAM leaf pool — has the operator's DramReclaimer + // installed, so the test can trigger Phase 1 via dramPool()->reclaim(). + MemoryPool* dramPool() const { + return dramRowPool_.get(); + } + + // Returns the CXL root pool — has the resource's CxlReclaimer installed + // via reclaimerFactory, so the test can trigger Phase 2 via + // cxlPool()->reclaim(). + MemoryPool* cxlPool() const { + return cxlRootPool_.get(); + } + + size_t hashTableSize() const { + return hashTable_.size(); + } + + size_t dramRowCount() const { + return countByLocation(Location::kDram); + } + + size_t cxlRowCount() const { + return countByLocation(Location::kCxl); + } + + size_t diskRowCount() const { + return diskRows_.size(); + } + + // Phase 1 (DRAM -> CXL): move the partition with the most + // DRAM-resident rows into CXL. The hash-table bucket for each row is + // swizzled to its new CXL address; the entry is not removed. + uint64_t spillTopPartitionToCxl(uint64_t /*targetBytes*/) { + const int partition = pickPartition(Location::kDram); + if (partition < 0) { + return 0; + } + uint64_t freed = 0; + for (auto& [key, entry] : hashTable_) { + if (entry.location != Location::kDram || + partitionOf(key) != partition) { + continue; + } + auto* cxlRow = static_cast( + cxlRowPool_->allocate(sizeof(EmulatedRow))); + *cxlRow = *entry.row; + dramRowPool_->free(entry.row, sizeof(EmulatedRow)); + entry.row = cxlRow; + entry.location = Location::kCxl; + freed += sizeof(EmulatedRow); + } + return freed; + } + + // Phase 2 (CXL -> disk): move the partition with the most CXL-resident + // rows into the disk-resident vector. The hash-table entries are + // erased, because the rows are no longer directly addressable from the + // CPU after the on-disk copy is the only canonical copy. + uint64_t spillCxlPartitionToDisk(uint64_t /*targetBytes*/) { + const int partition = pickPartition(Location::kCxl); + if (partition < 0) { + return 0; + } + uint64_t freed = 0; + for (auto it = hashTable_.begin(); it != hashTable_.end();) { + if (it->second.location != Location::kCxl || + partitionOf(it->first) != partition) { + ++it; + continue; + } + diskRows_.push_back(*it->second.row); + cxlRowPool_->free(it->second.row, sizeof(EmulatedRow)); + it = hashTable_.erase(it); + freed += sizeof(EmulatedRow); + } + return freed; + } + + private: + enum class Location { kDram, kCxl }; + struct Entry { + EmulatedRow* row; + Location location; + }; + + static int partitionOf(int64_t key) { + return static_cast(static_cast(key) % kNumPartitions); + } + + size_t countByLocation(Location target) const { + size_t count = 0; + for (const auto& [_, entry] : hashTable_) { + if (entry.location == target) { + ++count; + } + } + return count; + } + + // Picks the partition with the most rows of the given location, or + // -1 if no rows match. + int pickPartition(Location target) const { + std::array counts{}; + for (const auto& [key, entry] : hashTable_) { + if (entry.location == target) { + ++counts[partitionOf(key)]; + } + } + int best = -1; + int bestCount = 0; + for (int i = 0; i < kNumPartitions; ++i) { + if (counts[i] > bestCount) { + best = i; + bestCount = counts[i]; + } + } + return best; + } + + core::QueryCtx* const ctx_; + std::shared_ptr dramRowPool_; + // Root of the CXL custom resource; holds the CxlReclaimer installed by + // the resource's reclaimerFactory. Reserved for reclaim() dispatch only; + // allocations go through 'cxlRowPool_' below. + std::shared_ptr cxlRootPool_; + // Leaf child of 'cxlRootPool_'. Row bodies are allocated and freed + // here. + std::shared_ptr cxlRowPool_; + std::shared_ptr cxlCallbackSlot_; + std::unordered_map hashTable_; + std::vector diskRows_; +}; + +uint64_t DramReclaimer::reclaim( + MemoryPool* /*pool*/, + uint64_t targetBytes, + uint64_t /*maxWaitMs*/, + Stats& /*stats*/) { + return op_->spillTopPartitionToCxl(targetBytes); +} + +// Bundles a CustomMemoryResource backed by MallocAllocator with the +// callback slot the operator fills in after QueryCtx construction. +struct CxlResourceBundle { + CustomMemoryResource resource; + std::shared_ptr callbackSlot; +}; + +CxlResourceBundle makeCxlResource() { + MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + CxlResourceBundle bundle; + bundle.callbackSlot = std::make_shared(); + bundle.resource.tag = "cxl"; + bundle.resource.maxCapacity = 1L << 30; + bundle.resource.allocator = + std::make_shared(allocatorOptions); + bundle.resource.arbitrator = MemoryArbitrator::create({}); + auto slot = bundle.callbackSlot; + bundle.resource.reclaimerFactory = [slot]() { + return std::make_unique(slot); + }; + return bundle; +} + +// Materializes the CXL custom pool through the registered resource and +// attaches it to a fresh QueryCtx keyed by 'tag'. Mirrors the recommended +// caller-builds-pool flow for custom memory resources. +std::shared_ptr buildQueryCtxWithCxl( + CxlResourceBundle& bundle, + const std::string& queryId) { + auto* manager = memoryManager(); + manager->registerCustomResource(std::move(bundle.resource)); + const auto& registered = *manager->customResources().at("cxl"); + auto reclaimer = registered.reclaimerFactory(); + auto pool = manager->addRootPool( + fmt::format("{}.cxl", queryId), + registered.maxCapacity, + std::move(reclaimer), + std::nullopt, + "cxl"); + return core::QueryCtx::Builder() + .customPool("cxl", std::move(pool)) + .queryId(queryId) + .build(); +} + +} // namespace + +// End-to-end coverage for the CXL-backed HashAggregation flow documented in +// spilling.rst (`Reclaim Across Memory Resources` -> `Example: CXL-Backed +// Hash Aggregation`). The CXL custom memory resource is backed by +// MallocAllocator so the test runs on hardware without real CXL devices. +class CustomMemoryEmulationTest : public testing::Test { + protected: + void SetUp() override { + MemoryManager::testingSetInstance(MemoryManager::Options{}); + } +}; + +TEST_F(CustomMemoryEmulationTest, baselineAggregationWithoutSpill) { + auto cxl = makeCxlResource(); + auto queryCtx = buildQueryCtxWithCxl(cxl, "cxl-baseline"); + EmulatedCxlHashAggregation op(queryCtx.get(), cxl.callbackSlot); + + std::unordered_map expected; + for (int i = 0; i < 64; ++i) { + const int64_t key = i % 16; + const int64_t value = i; + op.addInput(key, value); + expected[key] += value; + } + + EXPECT_EQ(op.hashTableSize(), expected.size()); + EXPECT_EQ(op.dramRowCount(), expected.size()); + EXPECT_EQ(op.cxlRowCount(), 0); + EXPECT_EQ(op.diskRowCount(), 0); + EXPECT_EQ(op.finalize(), expected); +} + +TEST_F(CustomMemoryEmulationTest, dramToCxlToDiskChainPreservesIntegrity) { + auto cxl = makeCxlResource(); + auto queryCtx = buildQueryCtxWithCxl(cxl, "cxl-chain"); + EmulatedCxlHashAggregation op(queryCtx.get(), cxl.callbackSlot); + + std::unordered_map expected; + for (int i = 0; i < 64; ++i) { + const int64_t key = i % 16; + const int64_t value = i; + op.addInput(key, value); + expected[key] += value; + } + const size_t totalKeys = expected.size(); + ASSERT_EQ(op.dramRowCount(), totalKeys); + ASSERT_EQ(op.cxlRowCount(), 0); + ASSERT_EQ(op.diskRowCount(), 0); + + MemoryReclaimer::Stats stats; + + // Phase 1: DRAM -> CXL. The operator's DramReclaimer moves the top + // partition's rows into the CXL pool and swizzles the hash-table + // bucket pointers to the new addresses. The entries themselves remain; + // probe and finalize logic continue to read them directly. + const uint64_t freed1 = op.dramPool()->reclaim( + /*targetBytes=*/64, /*maxWaitMs=*/0, stats); + EXPECT_GT(freed1, 0); + const size_t cxlAfterFirst = op.cxlRowCount(); + EXPECT_GT(cxlAfterFirst, 0); + EXPECT_LT(op.dramRowCount(), totalKeys); + EXPECT_EQ(op.diskRowCount(), 0); + EXPECT_EQ(op.hashTableSize(), totalKeys) + << "Phase 1 swizzles bucket pointers; size must not change."; + + // Second DRAM reclaim moves another partition to CXL. + const uint64_t freed2 = op.dramPool()->reclaim( + /*targetBytes=*/64, /*maxWaitMs=*/0, stats); + EXPECT_GT(freed2, 0); + EXPECT_GT(op.cxlRowCount(), cxlAfterFirst); + EXPECT_EQ(op.diskRowCount(), 0); + EXPECT_EQ(op.hashTableSize(), totalKeys); + + // After two DRAM -> CXL hops, the CXL-resident rows are still directly + // readable: finalize() returns the same per-key sums it would in the + // baseline run. + EXPECT_EQ(op.finalize(), expected); + + // Phase 2: CXL -> disk. Arbitration on the CXL custom pool triggers the + // CxlReclaimer; the operator copies the top CXL partition's rows into + // the disk-resident vector and erases their hash-table entries. + const size_t cxlBeforePhase2 = op.cxlRowCount(); + const size_t hashSizeBeforePhase2 = op.hashTableSize(); + const uint64_t freed3 = op.cxlPool()->reclaim( + /*targetBytes=*/64, /*maxWaitMs=*/0, stats); + EXPECT_GT(freed3, 0); + EXPECT_GT(op.diskRowCount(), 0); + EXPECT_LT(op.cxlRowCount(), cxlBeforePhase2); + EXPECT_LT(op.hashTableSize(), hashSizeBeforePhase2) + << "Phase 2 erases hash-table entries for spilled keys."; + + // Phase 3: finalize merges DRAM-resident + CXL-resident + disk rows + // and yields the same per-key sums the baseline (no-spill) run would. + EXPECT_EQ(op.finalize(), expected); +} + +} // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/CustomMemoryPoolTest.cpp b/velox/common/memory/tests/CustomMemoryPoolTest.cpp new file mode 100644 index 00000000000..648cdf8a354 --- /dev/null +++ b/velox/common/memory/tests/CustomMemoryPoolTest.cpp @@ -0,0 +1,321 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" +#include "velox/core/QueryCtx.h" + +namespace facebook::velox::memory::test { +namespace { + +CustomMemoryResource makeResource(const std::string& tag) { + MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + CustomMemoryResource resource; + resource.tag = tag; + resource.allocator = std::make_shared(allocatorOptions); + resource.arbitrator = MemoryArbitrator::create({}); + resource.reclaimerFactory = []() { return MemoryReclaimer::create(0); }; + return resource; +} + +// Builds a root pool from a CustomMemoryResource that the caller has already +// registered on 'manager'. The reclaimer is whatever the caller chooses; for +// tests that don't need a QueryCtx-aware reclaimer this is just the resource's +// default factory. +std::shared_ptr buildPool( + MemoryManager* manager, + const CustomMemoryResource& resource, + const std::string& poolName, + int64_t maxCapacity = kMaxMemory, + std::unique_ptr reclaimer = nullptr) { + if (reclaimer == nullptr && resource.reclaimerFactory) { + reclaimer = resource.reclaimerFactory(); + } + return manager->addRootPool( + poolName, + maxCapacity, + std::move(reclaimer), + std::nullopt, + resource.tag); +} + +} // namespace + +// Each test gets its own MemoryManager so per-test resource registrations +// cannot collide on tag. +class CustomMemoryPoolTest : public testing::Test { + protected: + void SetUp() override { + MemoryManager::testingSetInstance(MemoryManager::Options{}); + } +}; + +TEST_F(CustomMemoryPoolTest, customPoolCreation) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("gpu")); + const auto& registered = *manager->customResources().at("gpu"); + + auto pool = buildPool( + manager, registered, "query.q0.gpu", /*maxCapacity=*/1L << 28); + auto queryCtx = core::QueryCtx::Builder() + .customPool("gpu", pool) + .queryId("q0") + .build(); + + auto looked = queryCtx->customPool("gpu"); + ASSERT_NE(looked, nullptr); + EXPECT_EQ(looked.get(), pool.get()); + EXPECT_EQ(looked->name(), "query.q0.gpu"); + EXPECT_EQ(looked->maxCapacity(), 1L << 28); + + EXPECT_EQ(queryCtx->customPool("missing"), nullptr); + EXPECT_EQ(queryCtx->customPools().size(), 1); +} + +TEST_F(CustomMemoryPoolTest, customPoolsKeyedByTag) { + auto* manager = memoryManager(); + for (const auto* tag : {"a", "b", "c"}) { + manager->registerCustomResource(makeResource(tag)); + } + + auto builder = core::QueryCtx::Builder().queryId("q-keyed"); + for (const auto* tag : {"a", "b", "c"}) { + const auto& registered = *manager->customResources().at(tag); + builder.customPool( + tag, buildPool(manager, registered, fmt::format("q-keyed.{}", tag))); + } + auto queryCtx = builder.build(); + + ASSERT_EQ(queryCtx->customPools().size(), 3); + EXPECT_NE(queryCtx->customPool("a"), nullptr); + EXPECT_NE(queryCtx->customPool("b"), nullptr); + EXPECT_NE(queryCtx->customPool("c"), nullptr); +} + +// Locks down the core dispatch invariant: a custom-resource root pool and its +// children must allocate through the resource's allocator, not the +// MemoryManager default. +TEST_F(CustomMemoryPoolTest, customPoolDispatchesToResourceAllocator) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("gpu")); + const auto& registered = *manager->customResources().at("gpu"); + auto* expectedAllocator = registered.allocator.get(); + + auto pool = buildPool(manager, registered, "q-dispatch.gpu"); + auto queryCtx = core::QueryCtx::Builder() + .customPool("gpu", pool) + .queryId("q-dispatch") + .build(); + auto root = queryCtx->customPool("gpu"); + ASSERT_NE(root, nullptr); + EXPECT_EQ( + static_cast(root.get())->testingAllocator(), + expectedAllocator); + + auto aggregate = root->addAggregateChild("agg"); + auto leaf = aggregate->addLeafChild("leaf"); + EXPECT_EQ( + static_cast(aggregate.get())->testingAllocator(), + expectedAllocator); + EXPECT_EQ( + static_cast(leaf.get())->testingAllocator(), + expectedAllocator); +} + +namespace { + +// Test reclaimer that "spills" by allocating from a sibling resource's pool. +// Models the chained-reclaimer pattern (e.g. device-memory pool spills into a +// pinned-host pool). +class SpillToSiblingReclaimer : public MemoryReclaimer { + public: + static std::unique_ptr create( + std::shared_ptr sibling) { + return std::unique_ptr( + new SpillToSiblingReclaimer(std::move(sibling))); + } + + ~SpillToSiblingReclaimer() override { + // Release any buffers we acquired during reclaim before the leaf pools + // they came from get destroyed; otherwise the leaf destructor aborts on + // outstanding usage. + for (auto& spill : spills_) { + spill.leaf->free(spill.buffer, static_cast(spill.bytes)); + } + } + + uint64_t reclaim( + MemoryPool* /*pool*/, + uint64_t targetBytes, + uint64_t /*maxWaitMs*/, + Stats& /*stats*/) override { + ++numReclaimCalls_; + auto leaf = + sibling_->addLeafChild(fmt::format("spill-{}", numReclaimCalls_)); + void* buffer = leaf->allocate(static_cast(targetBytes)); + spills_.push_back({std::move(leaf), buffer, targetBytes}); + return targetBytes; + } + + int numReclaimCalls() const { + return numReclaimCalls_; + } + + private: + explicit SpillToSiblingReclaimer(std::shared_ptr sibling) + : MemoryReclaimer(0), sibling_(std::move(sibling)) {} + + struct Spill { + std::shared_ptr leaf; + void* buffer; + uint64_t bytes; + }; + + std::shared_ptr sibling_; + int numReclaimCalls_{0}; + std::vector spills_; +}; + +} // namespace + +// Locks down the cross-resource spill flow: when reclaim is triggered on a +// custom pool, the resource's reclaimer can allocate into a sibling resource's +// pool, modeling device -> pinned-host spill. The caller builds the host pool +// first, captures it inside the device reclaimer, then builds the device pool +// with that reclaimer — no QueryCtx pointer is needed during reclaimer +// construction because the sibling pool itself is the link. +TEST_F(CustomMemoryPoolTest, deviceReclaimerSpillsToHostSibling) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("host")); + manager->registerCustomResource(makeResource("device")); + const auto& hostResource = *manager->customResources().at("host"); + const auto& deviceResource = *manager->customResources().at("device"); + + auto hostPool = buildPool(manager, hostResource, "q-spill.host"); + auto deviceReclaimer = SpillToSiblingReclaimer::create(hostPool); + auto* mockReclaimer = + static_cast(deviceReclaimer.get()); + auto devicePool = buildPool( + manager, + deviceResource, + "q-spill.device", + kMaxMemory, + std::move(deviceReclaimer)); + + auto queryCtx = core::QueryCtx::Builder() + .customPool("host", hostPool) + .customPool("device", devicePool) + .queryId("q-spill") + .build(); + ASSERT_EQ(queryCtx->customPool("host").get(), hostPool.get()); + ASSERT_EQ(queryCtx->customPool("device").get(), devicePool.get()); + + const uint64_t target = 4 * 1024; + MemoryReclaimer::Stats stats; + const auto reclaimed = devicePool->reclaim(target, 0, stats); + + EXPECT_EQ(mockReclaimer->numReclaimCalls(), 1); + EXPECT_EQ(reclaimed, target); + EXPECT_GE(hostPool->usedBytes(), static_cast(target)); +} + +// Builder rejects duplicate tags and null pools. +TEST_F(CustomMemoryPoolTest, builderRejectsBadInputs) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("gpu")); + const auto& registered = *manager->customResources().at("gpu"); + auto pool = buildPool(manager, registered, "q-bad.gpu"); + + auto builder = core::QueryCtx::Builder().queryId("q-bad"); + builder.customPool("gpu", pool); + EXPECT_THROW(builder.customPool("gpu", pool), VeloxRuntimeError); + EXPECT_THROW(builder.customPool("other", nullptr), VeloxRuntimeError); + EXPECT_THROW(builder.customPool("", pool), VeloxRuntimeError); +} + +// QueryCtx::addCustomPool is part of the public surface so a query can +// attach a custom pool after the Builder has already produced the QueryCtx. +// Verifies the same validation as Builder::customPool applies. +TEST_F(CustomMemoryPoolTest, addCustomPoolDirectly) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("gpu")); + const auto& registered = *manager->customResources().at("gpu"); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-direct").build(); + ASSERT_EQ(queryCtx->customPools().size(), 0); + + auto pool = buildPool(manager, registered, "q-direct.gpu"); + queryCtx->addCustomPool("gpu", pool); + EXPECT_EQ(queryCtx->customPool("gpu").get(), pool.get()); + EXPECT_EQ(queryCtx->customPools().size(), 1); + + EXPECT_THROW(queryCtx->addCustomPool("gpu", pool), VeloxRuntimeError); + EXPECT_THROW(queryCtx->addCustomPool("other", nullptr), VeloxRuntimeError); + EXPECT_THROW(queryCtx->addCustomPool("", pool), VeloxRuntimeError); +} + +// Models the QueryCtx-aware reclaimer flow: build the pool without a +// reclaimer (skipping the resource's factory), hand it to the Builder, then +// attach a reclaimer that references the QueryCtx via +// MemoryPool::setReclaimer. The reclaimer reaches a sibling pool through +// queryCtx->customPool(). +TEST_F(CustomMemoryPoolTest, postBuildSetReclaimerWithQueryCtxAware) { + auto* manager = memoryManager(); + manager->registerCustomResource(makeResource("host")); + manager->registerCustomResource(makeResource("device")); + + auto hostPool = manager->addRootPool( + "q-postbuild.host", + kMaxMemory, + /*reclaimer=*/nullptr, + std::nullopt, + "host"); + auto devicePool = manager->addRootPool( + "q-postbuild.device", + kMaxMemory, + /*reclaimer=*/nullptr, + std::nullopt, + "device"); + + auto queryCtx = core::QueryCtx::Builder() + .customPool("host", hostPool) + .customPool("device", devicePool) + .queryId("q-postbuild") + .build(); + + // Attach a sibling-aware reclaimer to the device pool after the QueryCtx + // exists, reaching the host pool through customPool(). + auto reclaimer = + SpillToSiblingReclaimer::create(queryCtx->customPool("host")); + auto* mockReclaimer = static_cast(reclaimer.get()); + devicePool->setReclaimer(std::move(reclaimer)); + + const uint64_t target = 4 * 1024; + MemoryReclaimer::Stats stats; + const auto reclaimed = devicePool->reclaim(target, 0, stats); + + EXPECT_EQ(mockReclaimer->numReclaimCalls(), 1); + EXPECT_EQ(reclaimed, target); + EXPECT_GE(hostPool->usedBytes(), static_cast(target)); +} + +} // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp new file mode 100644 index 00000000000..d95ed86e385 --- /dev/null +++ b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" + +namespace facebook::velox::memory::test { +namespace { + +CustomMemoryResource makeResource(const std::string& tag) { + MemoryAllocator::Options allocatorOptions; + allocatorOptions.capacity = 1L << 30; + CustomMemoryResource resource; + resource.tag = tag; + resource.allocator = std::make_shared(allocatorOptions); + resource.arbitrator = MemoryArbitrator::create({}); + resource.reclaimerFactory = []() { return MemoryReclaimer::create(0); }; + return resource; +} + +} // namespace + +TEST(CustomMemoryRegistration, registrationValidation) { + MemoryManager manager{}; + ASSERT_TRUE(manager.customResources().empty()); + + auto resource = makeResource("test-resource"); + resource.maxCapacity = 1L << 28; + manager.registerCustomResource(std::move(resource)); + + ASSERT_EQ(manager.customResources().size(), 1); + const auto& stored = manager.customResources().at("test-resource"); + EXPECT_EQ(stored->tag, "test-resource"); + EXPECT_EQ(stored->maxCapacity, 1L << 28); + + // Try to register another resource with the same tag, which should be + // rejected. + VELOX_ASSERT_THROW( + manager.registerCustomResource(makeResource("test-resource")), + "CustomMemoryResource already registered for tag: test-resource"); + ASSERT_EQ(manager.customResources().size(), 1); + + // Try to register resources with missing tag, which should be rejected. + CustomMemoryResource emptyTag; + emptyTag.allocator = makeResource("ignored").allocator; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(emptyTag)), + "CustomMemoryResource tag is empty"); + + // Try to register resources with null allocator, which should be rejected. + CustomMemoryResource nullAllocator; + nullAllocator.tag = "another"; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullAllocator)), + "CustomMemoryResource allocator is null for tag: another"); + + // Try to register resources with null arbitrator, which should be rejected. + auto nullArbitrator = makeResource("another"); + nullArbitrator.arbitrator.reset(); + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullArbitrator)), + "CustomMemoryResource arbitrator is null for tag: another"); + + // Try to register resources with null reclaimerFactory, which should be + // rejected. + auto nullFactory = makeResource("another"); + nullFactory.reclaimerFactory = nullptr; + VELOX_ASSERT_THROW( + manager.registerCustomResource(std::move(nullFactory)), + "CustomMemoryResource reclaimerFactory is null for tag: another"); +} + +TEST(CustomMemoryRegistration, registrationsAreReachableByTag) { + MemoryManager manager{}; + for (const auto* tag : {"a", "b", "c"}) { + manager.registerCustomResource(makeResource(tag)); + } + ASSERT_EQ(manager.customResources().size(), 3); + EXPECT_NE(manager.customResources().find("a"), manager.customResources().end()); + EXPECT_NE(manager.customResources().find("b"), manager.customResources().end()); + EXPECT_NE(manager.customResources().find("c"), manager.customResources().end()); +} + +TEST(CustomMemoryRegistration, addRootPoolRejectsUnregisteredTag) { + MemoryManager manager{}; + manager.registerCustomResource(makeResource("gpu")); + + // Adding a root pool with a registered tag should succeed. + auto tagged = manager.addRootPool( + "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); + ASSERT_NE(tagged, nullptr); + + // Adding a root pool without a tag should succeed. + auto untagged = manager.addRootPool("untagged-root"); + ASSERT_NE(untagged, nullptr); + + // Adding a root pool with an unregistered tag should be rejected. + VELOX_ASSERT_THROW( + manager.addRootPool( + "bad-root", kMaxMemory, nullptr, std::nullopt, "not-registered"), + "No CustomMemoryResource registered for tag: not-registered"); +} + +} // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/CustomMemoryResourceTest.cpp b/velox/common/memory/tests/CustomMemoryResourceTest.cpp deleted file mode 100644 index a57f85a58e5..00000000000 --- a/velox/common/memory/tests/CustomMemoryResourceTest.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include "velox/common/base/tests/GTestUtils.h" -#include "velox/common/memory/CustomMemoryResource.h" -#include "velox/common/memory/MallocAllocator.h" -#include "velox/common/memory/Memory.h" -#include "velox/common/memory/MemoryArbitrator.h" -#include "velox/core/QueryCtx.h" - -namespace facebook::velox::memory::test { -namespace { - -CustomMemoryResource makeResource(const std::string& tag) { - MemoryAllocator::Options allocatorOptions; - allocatorOptions.capacity = 1L << 30; - CustomMemoryResource resource; - resource.tag = tag; - resource.allocator = std::make_shared(allocatorOptions); - resource.arbitrator = MemoryArbitrator::create({}); - resource.reclaimerFactory = [](core::QueryCtx*) { - return MemoryReclaimer::create(0); - }; - return resource; -} - -} // namespace - -// MemoryManager-level tests use a local MemoryManager so they don't touch -// the process-wide singleton. -class CustomMemoryResourceManagerTest : public testing::Test {}; - -TEST_F(CustomMemoryResourceManagerTest, registrationValidation) { - MemoryManager manager{}; - ASSERT_TRUE(manager.customResources().empty()); - - auto resource = makeResource("test-resource"); - resource.maxCapacity = 1L << 28; - manager.registerCustomResource(std::move(resource)); - - ASSERT_EQ(manager.customResources().size(), 1); - EXPECT_EQ(manager.customResources()[0]->tag, "test-resource"); - EXPECT_EQ(manager.customResources()[0]->maxCapacity, 1L << 28); - - // Try to register another resource with the same tag, which should be rejected. - VELOX_ASSERT_THROW( - manager.registerCustomResource(makeResource("test-resource")), - "CustomMemoryResource already registered for tag: test-resource"); - ASSERT_EQ(manager.customResources().size(), 1); - - // Try to register resources with missing tag, which should be rejected. - CustomMemoryResource emptyTag; - emptyTag.allocator = makeResource("ignored").allocator; - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(emptyTag)), - "CustomMemoryResource tag is empty"); - - // Try to register resources with null allocator, which should be rejected. - CustomMemoryResource nullAllocator; - nullAllocator.tag = "another"; - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullAllocator)), - "CustomMemoryResource allocator is null for tag: another"); - - // Try to register resources with null arbitrator, which should be rejected. - auto nullArbitrator = makeResource("another"); - nullArbitrator.arbitrator.reset(); - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullArbitrator)), - "CustomMemoryResource arbitrator is null for tag: another"); - - // Try to register resources with null reclaimerFactory, which should be - // rejected. - auto nullFactory = makeResource("another"); - nullFactory.reclaimerFactory = nullptr; - VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullFactory)), - "CustomMemoryResource reclaimerFactory is null for tag: another"); -} - -TEST_F(CustomMemoryResourceManagerTest, registrationOrderPreserved) { - MemoryManager manager{}; - for (const auto* tag : {"a", "b", "c"}) { - manager.registerCustomResource(makeResource(tag)); - } - ASSERT_EQ(manager.customResources().size(), 3); - EXPECT_EQ(manager.customResources()[0]->tag, "a"); - EXPECT_EQ(manager.customResources()[1]->tag, "b"); - EXPECT_EQ(manager.customResources()[2]->tag, "c"); -} - -TEST_F(CustomMemoryResourceManagerTest, addRootPoolRejectsUnregisteredTag) { - MemoryManager manager{}; - manager.registerCustomResource(makeResource("gpu")); - - // Adding a root pool with a registered tag should succeed. - auto tagged = manager.addRootPool( - "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); - ASSERT_NE(tagged, nullptr); - - // Adding a root pool without a tag should succeed. - auto untagged = manager.addRootPool("untagged-root"); - ASSERT_NE(untagged, nullptr); - - // Adding a root pool with an unregistered tag should be rejected. - VELOX_ASSERT_THROW( - manager.addRootPool( - "bad-root", kMaxMemory, nullptr, std::nullopt, "not-registered"), - "No CustomMemoryResource registered for tag: not-registered"); -} - -// QueryCtx-level tests run against the process-wide MemoryManager because -// QueryCtx::Builder reads it directly. Each test cleans up after itself so -// resources don't leak into siblings. -class CustomMemoryResourceQueryCtxTest : public testing::Test { - protected: - static void SetUpTestCase() { - MemoryManager::testingSetInstance(MemoryManager::Options{}); - } - - void TearDown() override { - memoryManager()->testingClearCustomResources(); - } -}; - -TEST_F(CustomMemoryResourceQueryCtxTest, customPoolCreation) { - auto resource = makeResource("gpu"); - resource.maxCapacity = 1L << 28; - - bool factoryInvoked = false; - core::QueryCtx* factorySawCtx = nullptr; - resource.reclaimerFactory = [&](core::QueryCtx* ctx) { - factoryInvoked = true; - factorySawCtx = ctx; - return MemoryReclaimer::create(0); - }; - - memoryManager()->registerCustomResource(std::move(resource)); - - auto queryCtx = core::QueryCtx::Builder().queryId("q0").build(); - EXPECT_TRUE(factoryInvoked); - EXPECT_EQ(factorySawCtx, queryCtx.get()); - - auto pool = queryCtx->customPool("gpu"); - ASSERT_NE(pool, nullptr); - EXPECT_TRUE(pool->name().starts_with("query.q0.")); - EXPECT_TRUE(pool->name().ends_with(".gpu")); - EXPECT_EQ(pool->maxCapacity(), 1L << 28); - - EXPECT_EQ(queryCtx->customPool("missing"), nullptr); - EXPECT_EQ(queryCtx->customPools().size(), 1); -} - -TEST_F(CustomMemoryResourceQueryCtxTest, customPoolsKeyedByTag) { - for (const auto* tag : {"a", "b", "c"}) { - memoryManager()->registerCustomResource(makeResource(tag)); - } - - auto queryCtx = core::QueryCtx::Builder().queryId("q-keyed").build(); - ASSERT_EQ(queryCtx->customPools().size(), 3); - EXPECT_NE(queryCtx->customPool("a"), nullptr); - EXPECT_NE(queryCtx->customPool("b"), nullptr); - EXPECT_NE(queryCtx->customPool("c"), nullptr); -} - -// Locks down the core dispatch invariant: a custom-resource root pool and its -// children must allocate through the resource's allocator, not the -// MemoryManager default. -TEST_F( - CustomMemoryResourceQueryCtxTest, - customPoolDispatchesToResourceAllocator) { - auto resource = makeResource("gpu"); - auto* expectedAllocator = resource.allocator.get(); - memoryManager()->registerCustomResource(std::move(resource)); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-dispatch").build(); - auto root = queryCtx->customPool("gpu"); - ASSERT_NE(root, nullptr); - EXPECT_EQ( - static_cast(root.get())->testingAllocator(), - expectedAllocator); - - auto aggregate = root->addAggregateChild("agg"); - auto leaf = aggregate->addLeafChild("leaf"); - EXPECT_EQ( - static_cast(aggregate.get())->testingAllocator(), - expectedAllocator); - EXPECT_EQ( - static_cast(leaf.get())->testingAllocator(), - expectedAllocator); -} - -// Test reclaimer that "spills" by allocating from a sibling resource's pool. -// Models the chained-reclaimer pattern (e.g. device-memory pool spills into a -// pinned-host pool). -class SpillToSiblingReclaimer : public MemoryReclaimer { - public: - static std::unique_ptr create( - std::shared_ptr sibling) { - return std::unique_ptr( - new SpillToSiblingReclaimer(std::move(sibling))); - } - - ~SpillToSiblingReclaimer() override { - // Release any buffers we acquired during reclaim before the leaf pools - // they came from get destroyed; otherwise the leaf destructor aborts on - // outstanding usage. - for (auto& spill : spills_) { - spill.leaf->free(spill.buffer, static_cast(spill.bytes)); - } - } - - uint64_t reclaim( - MemoryPool* /*pool*/, - uint64_t targetBytes, - uint64_t /*maxWaitMs*/, - Stats& /*stats*/) override { - ++numReclaimCalls_; - auto leaf = sibling_->addLeafChild( - fmt::format("spill-{}", numReclaimCalls_)); - void* buffer = leaf->allocate(static_cast(targetBytes)); - spills_.push_back({std::move(leaf), buffer, targetBytes}); - return targetBytes; - } - - int numReclaimCalls() const { - return numReclaimCalls_; - } - - private: - explicit SpillToSiblingReclaimer(std::shared_ptr sibling) - : MemoryReclaimer(0), sibling_(std::move(sibling)) {} - - struct Spill { - std::shared_ptr leaf; - void* buffer; - uint64_t bytes; - }; - - std::shared_ptr sibling_; - int numReclaimCalls_{0}; - std::vector spills_; -}; - -// Locks down the cross-resource spill flow: when reclaim is triggered on a -// custom pool, the resource's reclaimer can allocate into a sibling resource's -// pool, modeling device -> pinned-host spill. -TEST_F(CustomMemoryResourceQueryCtxTest, deviceReclaimerSpillsToHostSibling) { - memoryManager()->registerCustomResource(makeResource("host")); - - SpillToSiblingReclaimer* mockReclaimer = nullptr; - auto device = makeResource("device"); - device.reclaimerFactory = [&](core::QueryCtx* ctx) { - auto reclaimer = SpillToSiblingReclaimer::create(ctx->customPool("host")); - mockReclaimer = static_cast(reclaimer.get()); - return reclaimer; - }; - memoryManager()->registerCustomResource(std::move(device)); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-spill").build(); - auto devicePool = queryCtx->customPool("device"); - auto hostPool = queryCtx->customPool("host"); - ASSERT_NE(devicePool, nullptr); - ASSERT_NE(hostPool, nullptr); - ASSERT_NE(mockReclaimer, nullptr); - - const uint64_t target = 4 * 1024; - MemoryReclaimer::Stats stats; - const auto reclaimed = devicePool->reclaim(target, 0, stats); - - EXPECT_EQ(mockReclaimer->numReclaimCalls(), 1); - EXPECT_EQ(reclaimed, target); - EXPECT_GE(hostPool->usedBytes(), static_cast(target)); -} -} // namespace facebook::velox::memory::test diff --git a/velox/core/QueryCtx.cpp b/velox/core/QueryCtx.cpp index fa13473f814..c6362f62d32 100644 --- a/velox/core/QueryCtx.cpp +++ b/velox/core/QueryCtx.cpp @@ -20,25 +20,6 @@ #include "velox/common/config/Config.h" namespace facebook::velox::core { -namespace { - -void createCustomMemoryPools(QueryCtx* queryCtx) { - for (const auto& mr : memory::memoryManager()->customResources()) { - auto reclaimer = mr->reclaimerFactory(queryCtx); - auto pool = memory::memoryManager()->addRootPool( - fmt::format( - "{}.{}", - QueryCtx::generatePoolName(queryCtx->queryId()), - mr->tag), - mr->maxCapacity, - std::move(reclaimer), - std::nullopt, - mr->tag); - queryCtx->addCustomPool(mr->tag, std::move(pool)); - } -} - -} // namespace // static std::shared_ptr QueryCtx::create( @@ -78,7 +59,9 @@ std::shared_ptr QueryCtx::Builder::build() { for (auto& cb : releaseCallbacks_) { queryCtx->addReleaseCallback(std::move(cb)); } - createCustomMemoryPools(queryCtx.get()); + for (auto& [tag, pool] : customPools_) { + queryCtx->addCustomPool(tag, std::move(pool)); + } return queryCtx; } @@ -126,9 +109,10 @@ QueryCtx::~QueryCtx() { } void QueryCtx::addCustomPool( - const std::string& tag, + std::string tag, std::shared_ptr pool) { - VELOX_CHECK_NOT_NULL(pool); + VELOX_CHECK(!tag.empty(), "Custom pool tag is empty"); + VELOX_CHECK_NOT_NULL(pool, "Custom pool is null for tag: {}", tag); auto [_, inserted] = customPools_.emplace(tag, std::move(pool)); VELOX_CHECK(inserted, "Duplicate custom pool tag: {}", tag); } diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index a6d4588ffdd..1cd605923e8 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -180,6 +180,21 @@ class QueryCtx : public std::enable_shared_from_this { return *this; } + /// Registers a caller-built root pool under 'tag' on the resulting + /// QueryCtx. Throws if 'tag' is already present or 'pool' is null. The + /// caller is responsible for constructing the pool, typically through + /// MemoryManager::addRootPool(name, cap, reclaimer, debugOpts, tag) with + /// a CustomMemoryResource previously registered on the MemoryManager. + Builder& customPool( + std::string tag, + std::shared_ptr pool) { + VELOX_CHECK(!tag.empty(), "Custom pool tag is empty"); + VELOX_CHECK_NOT_NULL(pool, "Custom pool is null for tag: {}", tag); + auto [_, inserted] = customPools_.emplace(tag, std::move(pool)); + VELOX_CHECK(inserted, "Duplicate custom pool tag: {}", tag); + return *this; + } + /// Adds a callback to be invoked when the QueryCtx is destroyed. /// Multiple callbacks can be added by calling this method multiple times. Builder& releaseCallback(ReleaseCallback callback) { @@ -209,6 +224,8 @@ class QueryCtx : public std::enable_shared_from_this { std::shared_ptr tokenProvider_; std::deque releaseCallbacks_; TraceCtxProvider traceCtxProvider_; + std::unordered_map> + customPools_; }; /// Generates a unique memory pool name for a query. @@ -379,11 +396,11 @@ class QueryCtx : public std::enable_shared_from_this { pool_ = std::move(pool); } - /// Tracks an additional root pool tied to a custom memory resource. Throws - /// if 'tag' is already present or 'pool' is null. - void addCustomPool( - const std::string& tag, - std::shared_ptr pool); + /// Tracks an additional root pool keyed by 'tag'. The pool's allocator + /// and arbitrator are borrowed from a CustomMemoryResource owned by the + /// MemoryManager registry, so its lifetime is governed externally. + /// Throws if 'tag' is already present or 'pool' is null. + void addCustomPool(std::string tag, std::shared_ptr pool); /// Returns the custom root pool for the given resource tag, or nullptr if /// none is registered under that tag for this query. @@ -488,8 +505,10 @@ class QueryCtx : public std::enable_shared_from_this { std::unordered_map> connectorSessionProperties_; std::shared_ptr pool_; + std::unordered_map> customPools_; + QueryConfig queryConfig_; std::atomic numSpilledBytes_{0}; std::atomic numTracedBytes_{0}; diff --git a/velox/docs/develop/memory.rst b/velox/docs/develop/memory.rst index 3675f821816..2bcf958da29 100644 --- a/velox/docs/develop/memory.rst +++ b/velox/docs/develop/memory.rst @@ -869,6 +869,160 @@ we don’t cap the memory allocations delegated to std::malloc in the reserve a small amount of memory capacity in *MmapAllocator* to compensate for these ad-hoc small allocations in practice. +Custom Memory Resources +----------------------- + +Velox supports heterogeneous compute extensions that operate across multiple +memory tiers. A query running on a GPU connector touches device memory; a +query running on a CXL-attached node touches a memory pool with different +latency and capacity characteristics; a connector issuing DMA transfers +needs a pinned-host allocator; NUMA-aware operators want a pool bound to a +specific socket. Each of these is a distinct memory resource that the engine +must track and arbitrate. + +The default memory system assumes a single homogeneous tier: host DRAM, +managed by *MemoryAllocator* and arbitrated by a single *MemoryArbitrator*. +Under this model, allocations issued by an extension are invisible to +*MemoryManager*, *MemoryArbitrator* cannot reason about them, and operators +have no way to request a memory pool backed by a particular tier. + +*CustomMemoryResource* is the registration primitive that lets an extension +expose additional memory tiers side-by-side with the default CPU tier. A +resource bundles a tag, an allocator, an arbitrator, and a factory that +builds a per-query reclaimer: + +.. code-block:: c++ + + struct CustomMemoryResource { + // Unique non-empty identifier. + std::string tag; + + // Capacity of the per-query root pool created for this resource. + int64_t maxCapacity{std::numeric_limits::max()}; + + // Allocator backing pools tagged with this resource. + std::shared_ptr allocator; + + // Arbitrator routing capacity decisions for pools tagged with this resource. + std::shared_ptr arbitrator; + + // Builds a reclaimer for pools tagged with this resource. + std::function()> reclaimerFactory; + }; + +Each resource carries its own allocator and arbitrator. The default CPU +allocator and arbitrator on *MemoryManager* are not shared with custom +resources, which keeps per-tier accounting and capacity decisions +self-contained. This matters when tiers differ in size, latency, alignment, +or allocation failure modes — properties that the default arbitrator was +not designed to reason about across resources. + +Registration and Per-Query Materialization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Extensions register resources at process startup, immediately after +*initializeMemoryManager*: + +.. code-block:: c++ + + auto allocator = std::make_shared(...); + auto arbitrator = MemoryArbitrator::create(...); + memory::memoryManager()->registerCustomResource( + memory::CustomMemoryResource{ + .tag = "device", + .maxCapacity = deviceCapacity, + .allocator = std::move(allocator), + .arbitrator = std::move(arbitrator), + .reclaimerFactory = []() { + return std::make_unique(); + }, + }); + +The *MemoryManager* keeps the resource alive until process shutdown. +Registration is not thread-safe and must happen before any *QueryCtx* is +constructed. + +For each query that wants to use a registered resource, the caller builds +the per-resource root pool through +*MemoryManager::addRootPool(name, maxCapacity, reclaimer, debugOpts, tag)* +and hands it to the *QueryCtx* via *Builder::customPool*: + +.. code-block:: c++ + + auto* manager = memory::memoryManager(); + auto reclaimer = std::make_unique(); + auto devicePool = manager->addRootPool( + "query.q0.device", deviceCapacity, std::move(reclaimer), + /*poolDebugOpts*/ std::nullopt, "device"); + auto queryCtx = core::QueryCtx::Builder() + .customPool("device", std::move(devicePool)) + .queryId("q0") + .build(); + +The pool allocates through the registered resource's allocator and is +arbitrated by the registered resource's arbitrator. The root pool is +exposed on *QueryCtx* keyed by tag: + +.. code-block:: c++ + + // Returns the custom root pool for the given resource tag, or nullptr. + std::shared_ptr QueryCtx::customPool( + const std::string& tag) const; + +If the reclaimer needs to address sibling custom pools on the *QueryCtx* +(for instance, to spill into another tier), construct the pool without a +reclaimer and attach the reclaimer post-build with the real *QueryCtx*: + +.. code-block:: c++ + + auto devicePool = manager->addRootPool( + "query.q0.device", deviceCapacity, /*reclaimer*/ nullptr, + std::nullopt, "device"); + auto queryCtx = core::QueryCtx::Builder() + .customPool("device", devicePool) + .queryId("q0") + .build(); + devicePool->setReclaimer(std::make_unique(queryCtx.get())); + +The *reclaimerFactory* on *CustomMemoryResource* is the resource's +commitment to produce reclaimers for its pools. Callers invoke it +explicitly when materializing the per-query pool (the framework no longer +calls it implicitly). It is required at registration; for the +post-build *setReclaimer* path above, the factory's output is unused for +that particular pool but still serves as the default for queries that do +not override it. + +Operators that want to allocate from a particular tier ask the *QueryCtx* +for the matching pool by tag and create their own child leaf pool from it. +Operators that don't care about custom tiers behave exactly as before — +the default CPU root pool returned by *QueryCtx::pool()* is unchanged, and +so is its arbitration path. When a *QueryCtx* is built without any +*customPool* calls, *CustomMemoryResource* is inert for that query. + +Scope +^^^^^ + +The framework is deliberately the *plumbing* layer. It provides: + +* registration and per-query materialization of additional memory pools, +* per-tier accounting and arbitration through the resource's own allocator + and arbitrator, +* an extension point (the per-query reclaimer) for tier-specific reclaim + behavior. + +It does not provide cross-resource arbitration. When a custom pool runs +short of capacity, its own arbitrator decides what to do; the default +*MemoryArbitrator* on *MemoryManager* will not move memory between +resources. If an extension wants reclaim on one tier to spill into another +(for instance, device memory into pinned-host memory), the policy lives in +the per-query reclaimer the caller attaches to the custom pool. The +reclaimer reaches sibling tiers either by holding a *shared_ptr* to the +sibling pool captured at construction, or by being attached after +*QueryCtx::build()* (via *MemoryPool::setReclaimer*) and looking siblings +up through *QueryCtx::customPool*. See +`reclaim across memory resources `_ +in the spilling document. + Server OOM Prevention --------------------- diff --git a/velox/docs/develop/spilling.rst b/velox/docs/develop/spilling.rst index d6cd4113ba5..ee16da1c632 100644 --- a/velox/docs/develop/spilling.rst +++ b/velox/docs/develop/spilling.rst @@ -530,6 +530,129 @@ spilling: bridge will split the spill partition files among the hash build operators with each one having an equally-sized shard to restore. +Reclaim Across Memory Resources +------------------------------- + +The spilling algorithms above describe how a spillable operator releases +memory by serializing in-memory state to disk. With +`custom memory resources `_, the reclaim +path is no longer fixed to disk. The per-query reclaimer attached to a +custom root pool can move data into a sibling tier instead — for example, +copying buffers out of a device memory pool into a pinned-host pool, or +evicting cold pages from a CXL-attached pool into a local DRAM pool. The +extension chooses what each reclaimer does; the framework only guarantees +that *MemoryPool::reclaim* on a custom pool dispatches to the reclaimer the +extension supplied. + +The reclaimer holds a pointer to its owning *QueryCtx* and can therefore +address every other custom pool on the query through *QueryCtx::customPool*. +A chain of reclaimers — for instance, hash table on CPU DRAM → CXL on first +reclaim → disk on second reclaim — is built by composing reclaimers across +resources. Each link in the chain is an independent reclaim decision made +by the arbitrator of the pool that ran short. The framework does not +coordinate the chain; the extension chooses the topology by deciding which +sibling each reclaimer allocates into. + +Example: CXL-Backed Hash Aggregation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The reclaim path above is meaningful when the custom resource exposes a +tier with different access properties from CPU DRAM. CXL-attached memory +is one such tier: it is slower than local DRAM but cheaper per byte, and +crucially it is exposed to the CPU as part of the coherent virtual address +space — host code can dereference a CXL pointer directly, with no copy +back to DRAM required. + +A CXL extension takes advantage of this by registering a CXL custom +resource and installing a CXL-aware *HashAggregation* through +*DriverAdapter*. The adapter is the standard mechanism Velox uses to swap +in alternate operator implementations (also used by the cuDF and Wave +extensions); see *DriverAdapter* in *velox/exec/Driver.h*. The CXL-backed +operator preserves the public *HashAggregation* contract while changing +the reclaim policy: + +#. **First reclaim trigger (DRAM is tight).** Like the default operator, + the CXL-backed *HashAggregation* selects a spill target — the set of + partitions with the most data — and processes the corresponding rows + in its DRAM row container. Unlike the default operator, it copies each + row into the CXL custom pool obtained via *queryCtx->customPool("cxl")* + and pointer-swizzles the corresponding hash-table bucket entries to the + new CXL addresses, rather than clearing them. The swizzle has the same + cost shape as the default operator's hash-table entry removal — a + *ProbeState* walk per row to find each bucket slot — but with greater + benefit: because CXL memory is coherent with the CPU address space, + probe and finalize logic continue to read the relocated entries + directly with no restore step. + +#. **CXL pool fills.** Subsequent reclaim triggers continue moving + partitions from DRAM into CXL until the CXL custom pool's arbitrator + decides it is out of capacity. At that point arbitration on the CXL + pool calls *MemoryPool::reclaim* on the CXL root pool, which dispatches + to the reclaimer the resource's *reclaimerFactory* produced. + +#. **Second reclaim — CXL into disk.** The reclaimer behaves like the + default *HashAggregation* spill path, just sourced from CXL instead of + DRAM. It scans the partitions stored in CXL, serializes them through + the standard *Spiller* onto disk, and notifies the operator to remove + the corresponding entries from the hash table. Removal happens here, + not in step 1, because this is the first point at which the entries + are no longer directly addressable. + +#. **Finalize.** After receiving all input the operator produces results + by merging three sources for each partition: rows still in the DRAM row + container, rows that live in CXL (read directly with no restore), and + rows on disk (read back through the standard *Spiller* restore path). + For partitions that never made it past step 1 the merge degenerates to + "DRAM + CXL"; for partitions reclaimed in step 3 it degenerates to + "DRAM + disk" — the same shape as the default *HashAggregation* + algorithm. + +The CXL pool is registered like any other custom resource. The resource +carries the reclaimer factory the framework uses to materialize per-query +reclaimers for pools tagged ``"cxl"``: + +.. code-block:: c++ + + memory::memoryManager()->registerCustomResource( + memory::CustomMemoryResource{ + .tag = "cxl", + .maxCapacity = cxlCapacityBytes, + .allocator = std::make_shared(...), + .arbitrator = MemoryArbitrator::create(...), + .reclaimerFactory = []() { + return std::make_unique(); + }, + }); + + // Per query: invoke the registered factory to mint the reclaimer, build + // the CXL root pool with it, then hand the pool to the QueryCtx. + auto* manager = memory::memoryManager(); + auto& cxlResource = *manager->customResources().at("cxl"); + auto cxlPool = manager->addRootPool( + "query.q0.cxl", cxlResource.maxCapacity, + cxlResource.reclaimerFactory(), + /*poolDebugOpts*/ std::nullopt, "cxl"); + auto queryCtx = core::QueryCtx::Builder() + .customPool("cxl", std::move(cxlPool)) + .queryId("q0") + .build(); + +If a query needs a reclaimer that references the *QueryCtx* (for instance, +to spill into a sibling custom pool reachable through +*QueryCtx::customPool*), build the pool with a null reclaimer and attach +the QueryCtx-aware reclaimer post-build via *MemoryPool::setReclaimer* — +*setReclaimer* is one-shot, so the path skips the factory entirely. + +The arbitrator passed here governs only the CXL pool's capacity. The +default *MemoryArbitrator* on *MemoryManager* continues to govern the DRAM +side; it triggers the first reclaim (step 1) without knowledge that the +"reclaimed" bytes have actually been relocated to CXL. From the default +arbitrator's perspective the DRAM pool simply gave bytes back; from the +CXL arbitrator's perspective new bytes arrived. Cross-tier coordination — +the decision that CXL is the right next stop for DRAM, and disk is the +right next stop for CXL — lives entirely in the extension's operator and +reclaimer, not in the framework. + Future Work ----------- From 1985491a9f36d6478395a700602e885ce0a02141 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Tue, 19 May 2026 05:30:49 +0000 Subject: [PATCH 06/10] refactor(memory): align CustomMemoryResource with ScopedRegistry and make invalid states unrepresentable - Move the resource registry out of MemoryManager into a standalone CustomMemoryResourceRegistry that wraps ScopedRegistry. Exposes a process-global singleton via global() and per-query/per-test scopes via createRegistry(parent). Registration and lookup go through the Registry instance (Registry::insert, Registry::find, Registry::clear); aligns with the pattern from #16982/#16986. - Convert CustomMemoryResource from a struct with optional fields into a class with a validating constructor. Required fields are now checked at construction, so post-construction the resource is guaranteed valid. Removes the silent wrong-tier failure mode where a null allocator or arbitrator would let the pool quietly fall back to the default tier. - Add MemoryManager::addCustomRootPool(name, shared_ptr, debugOpts) as the per-query materialization entry point. Drop the resourceTag overload of addRootPool. - Tests use per-test isolated registries so concurrent runs cannot collide on tag registration. Co-Authored-By: Claude Opus 4.7 (1M context) --- velox/common/memory/CMakeLists.txt | 14 + velox/common/memory/CustomMemoryResource.cpp | 56 ++++ velox/common/memory/CustomMemoryResource.h | 56 ++-- .../memory/CustomMemoryResourceRegistry.cpp | 33 +++ .../memory/CustomMemoryResourceRegistry.h | 45 ++++ velox/common/memory/Memory.cpp | 62 ++--- velox/common/memory/Memory.h | 40 ++- velox/common/memory/tests/CMakeLists.txt | 1 + .../tests/CustomMemoryEmulationTest.cpp | 208 ++++----------- .../memory/tests/CustomMemoryPoolTest.cpp | 244 +++++++----------- .../tests/CustomMemoryRegistrationTest.cpp | 155 ++++++----- velox/core/QueryCtx.h | 11 +- velox/docs/develop/memory.rst | 178 ++++--------- velox/docs/develop/spilling.rst | 110 ++++---- 14 files changed, 562 insertions(+), 651 deletions(-) create mode 100644 velox/common/memory/CustomMemoryResource.cpp create mode 100644 velox/common/memory/CustomMemoryResourceRegistry.cpp create mode 100644 velox/common/memory/CustomMemoryResourceRegistry.h diff --git a/velox/common/memory/CMakeLists.txt b/velox/common/memory/CMakeLists.txt index 503c9b7893f..63c4c520d5a 100644 --- a/velox/common/memory/CMakeLists.txt +++ b/velox/common/memory/CMakeLists.txt @@ -22,6 +22,7 @@ velox_add_library( ArbitrationOperation.cpp ArbitrationParticipant.cpp ByteStream.cpp + CustomMemoryResource.cpp HashStringAllocator.cpp MallocAllocator.cpp Memory.cpp @@ -70,3 +71,16 @@ velox_link_libraries( glog::glog PRIVATE velox_test_util re2::re2 ) + +velox_add_library( + velox_custom_memory_resource_registry + CustomMemoryResourceRegistry.cpp + HEADERS + CustomMemoryResourceRegistry.h +) + +velox_link_libraries( + velox_custom_memory_resource_registry + velox_memory + velox_scoped_registry +) diff --git a/velox/common/memory/CustomMemoryResource.cpp b/velox/common/memory/CustomMemoryResource.cpp new file mode 100644 index 00000000000..1773963a38f --- /dev/null +++ b/velox/common/memory/CustomMemoryResource.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/memory/CustomMemoryResource.h" + +#include + +#include "velox/common/base/Exceptions.h" +#include "velox/common/memory/MemoryArbitrator.h" + +namespace facebook::velox::memory { + +CustomMemoryResource::CustomMemoryResource( + std::string tag, + std::shared_ptr allocator, + std::shared_ptr arbitrator, + ReclaimerFactory reclaimerFactory, + int64_t maxCapacity) + : tag_(std::move(tag)), + maxCapacity_(maxCapacity), + allocator_(std::move(allocator)), + arbitrator_(std::move(arbitrator)), + reclaimerFactory_(std::move(reclaimerFactory)) { + VELOX_USER_CHECK(!tag_.empty(), "CustomMemoryResource tag is empty"); + VELOX_USER_CHECK_NOT_NULL( + allocator_, + "CustomMemoryResource allocator is null for tag: {}", + tag_); + VELOX_USER_CHECK_NOT_NULL( + arbitrator_, + "CustomMemoryResource arbitrator is null for tag: {}", + tag_); + VELOX_USER_CHECK( + reclaimerFactory_ != nullptr, + "CustomMemoryResource reclaimerFactory is null for tag: {}", + tag_); +} + +std::unique_ptr CustomMemoryResource::newReclaimer() const { + return reclaimerFactory_(); +} + +} // namespace facebook::velox::memory diff --git a/velox/common/memory/CustomMemoryResource.h b/velox/common/memory/CustomMemoryResource.h index 00667a2e519..c4053e43183 100644 --- a/velox/common/memory/CustomMemoryResource.h +++ b/velox/common/memory/CustomMemoryResource.h @@ -29,28 +29,52 @@ class MemoryArbitrator; class MemoryReclaimer; /// Describes an externally-provided memory resource (e.g. a GPU or tiered -/// memory backend) registered with MemoryManager and referenced by 'tag' when -/// building per-query memory pools. -struct CustomMemoryResource { - /// Unique non-empty identifier. - std::string tag; +/// memory backend) registered with the memory subsystem and referenced by +/// 'tag' when building per-query memory pools. The constructor enforces +/// non-empty tag and non-null allocator, arbitrator, and reclaimerFactory; +/// once constructed, the resource is immutable. +class CustomMemoryResource { + public: + using ReclaimerFactory = std::function()>; - /// Capacity of the per-query root pool created for this resource. - int64_t maxCapacity{std::numeric_limits::max()}; + CustomMemoryResource( + std::string tag, + std::shared_ptr allocator, + std::shared_ptr arbitrator, + ReclaimerFactory reclaimerFactory, + int64_t maxCapacity = std::numeric_limits::max()); + + /// Unique identifier for this resource. + const std::string& tag() const { + return tag_; + } + + /// Capacity of the per-query root pool created from this resource. + int64_t maxCapacity() const { + return maxCapacity_; + } /// Allocator backing pools tagged with this resource. - std::shared_ptr allocator; + MemoryAllocator* allocator() const { + return allocator_.get(); + } /// Arbitrator routing capacity decisions for pools tagged with this /// resource. - std::shared_ptr arbitrator; - - /// Required. Builds a reclaimer for pools tagged with this resource. - /// Invoked by the caller (not the framework) when constructing the - /// per-query root pool. For reclaimers that need a QueryCtx-aware view, - /// the caller skips this factory and attaches the reclaimer post-build via - /// MemoryPool::setReclaimer. - std::function()> reclaimerFactory; + MemoryArbitrator* arbitrator() const { + return arbitrator_.get(); + } + + /// Returns a fresh reclaimer for a new pool by invoking the factory + /// supplied at construction. + std::unique_ptr newReclaimer() const; + + private: + const std::string tag_; + const int64_t maxCapacity_; + const std::shared_ptr allocator_; + const std::shared_ptr arbitrator_; + const ReclaimerFactory reclaimerFactory_; }; } // namespace facebook::velox::memory diff --git a/velox/common/memory/CustomMemoryResourceRegistry.cpp b/velox/common/memory/CustomMemoryResourceRegistry.cpp new file mode 100644 index 00000000000..b327eeb6f58 --- /dev/null +++ b/velox/common/memory/CustomMemoryResourceRegistry.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/common/memory/CustomMemoryResourceRegistry.h" + +namespace facebook::velox::memory { + +// static +CustomMemoryResourceRegistry::Registry& CustomMemoryResourceRegistry::global() { + static Registry instance; + return instance; +} + +// static +std::shared_ptr +CustomMemoryResourceRegistry::createRegistry(Registry* parent) { + return std::make_shared(parent); +} + +} // namespace facebook::velox::memory diff --git a/velox/common/memory/CustomMemoryResourceRegistry.h b/velox/common/memory/CustomMemoryResourceRegistry.h new file mode 100644 index 00000000000..9b2f2500595 --- /dev/null +++ b/velox/common/memory/CustomMemoryResourceRegistry.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "velox/common/ScopedRegistry.h" +#include "velox/common/memory/CustomMemoryResource.h" + +namespace facebook::velox::memory { + +/// Entry point for the CustomMemoryResource registry. Provides the +/// process-global root and a factory for scoped registries. Callers +/// register, look up, and clear entries directly on the Registry instance +/// (Registry::insert, Registry::find, Registry::clear). Registry methods +/// are thread-safe. +class CustomMemoryResourceRegistry { + public: + using Registry = ScopedRegistry; + + /// Process-global root registry. The reference is stable for the + /// lifetime of the process. + static Registry& global(); + + /// Creates a scoped registry. Defaults to inheriting from the global + /// registry; pass nullptr for isolation mode (no fallback). + static std::shared_ptr createRegistry(Registry* parent = &global()); +}; + +} // namespace facebook::velox::memory diff --git a/velox/common/memory/Memory.cpp b/velox/common/memory/Memory.cpp index 029999369fa..21951a575ea 100644 --- a/velox/common/memory/Memory.cpp +++ b/velox/common/memory/Memory.cpp @@ -284,30 +284,28 @@ std::shared_ptr MemoryManager::addRootPool( const std::string& name, int64_t maxCapacity, std::unique_ptr reclaimer, - const std::optional& poolDebugOpts, - const std::optional& resourceTag) { - if (!resourceTag.has_value()) { - return addRootPoolImpl( - name, - maxCapacity, - std::move(reclaimer), - poolDebugOpts, - /*customAllocator=*/nullptr, - /*customArbitrator=*/nullptr); - } - auto it = customResources_.find(*resourceTag); - VELOX_USER_CHECK( - it != customResources_.end(), - "No CustomMemoryResource registered for tag: {}", - *resourceTag); - const auto& resource = *it->second; + const std::optional& poolDebugOpts) { return addRootPoolImpl( name, maxCapacity, std::move(reclaimer), poolDebugOpts, - resource.allocator.get(), - resource.arbitrator.get()); + /*customAllocator=*/nullptr, + /*customArbitrator=*/nullptr); +} + +std::shared_ptr MemoryManager::addCustomRootPool( + const std::string& name, + std::shared_ptr resource, + const std::optional& poolDebugOpts) { + VELOX_USER_CHECK_NOT_NULL(resource); + return addRootPoolImpl( + name, + resource->maxCapacity(), + resource->newReclaimer(), + poolDebugOpts, + resource->allocator(), + resource->arbitrator()); } std::shared_ptr MemoryManager::addRootPoolImpl( @@ -412,32 +410,6 @@ MemoryArbitrator* MemoryManager::arbitrator() { return arbitrator_.get(); } -void MemoryManager::registerCustomResource(CustomMemoryResource resource) { - VELOX_USER_CHECK(!resource.tag.empty(), "CustomMemoryResource tag is empty"); - VELOX_USER_CHECK_NOT_NULL( - resource.allocator, - "CustomMemoryResource allocator is null for tag: {}", - resource.tag); - VELOX_USER_CHECK_NOT_NULL( - resource.arbitrator, - "CustomMemoryResource arbitrator is null for tag: {}", - resource.tag); - VELOX_USER_CHECK_NOT_NULL( - resource.reclaimerFactory, - "CustomMemoryResource reclaimerFactory is null for tag: {}", - resource.tag); - const auto tag = resource.tag; - auto [_, inserted] = customResources_.emplace( - tag, std::make_shared(std::move(resource))); - VELOX_USER_CHECK( - inserted, "CustomMemoryResource already registered for tag: {}", tag); -} - -const folly::F14FastMap>& -MemoryManager::customResources() const { - return customResources_; -} - std::string MemoryManager::toString(bool detail) const { const int64_t allocatorCapacity = capacity(); std::stringstream out; diff --git a/velox/common/memory/Memory.h b/velox/common/memory/Memory.h index 8fec6eebc32..a973fb6fa87 100644 --- a/velox/common/memory/Memory.h +++ b/velox/common/memory/Memory.h @@ -25,7 +25,6 @@ #include #include -#include #include #include @@ -217,27 +216,25 @@ class MemoryManager { /// Creates a root memory pool with specified 'name' and 'maxCapacity'. If /// 'name' is missing, the memory manager generates a default name internally - /// to ensure uniqueness. If 'resourceTag' is set, the pool's allocator and - /// arbitrator come from the CustomMemoryResource previously registered - /// under that tag via registerCustomResource(); throws if the tag is not - /// registered. + /// to ensure uniqueness. std::shared_ptr addRootPool( const std::string& name = "", int64_t maxCapacity = kMaxMemory, std::unique_ptr reclaimer = nullptr, const std::optional& poolDebugOpts = - std::nullopt, - const std::optional& resourceTag = std::nullopt); - - /// Registers a custom memory resource. Throws on empty or duplicate tag, - /// or on any null required field. NOT thread-safe: must be called during - /// process startup. - void registerCustomResource(CustomMemoryResource resource); - - /// Returns the registered custom resources keyed by tag. Iteration order - /// is unspecified. - const folly::F14FastMap>& - customResources() const; + std::nullopt); + + /// Creates a root memory pool backed by 'resource'. The pool's capacity + /// comes from 'resource->maxCapacity'; its reclaimer comes from + /// 'resource->reclaimerFactory()'; its allocator and arbitrator are + /// borrowed from 'resource->allocator' and 'resource->arbitrator'. The + /// caller (typically via CustomMemoryResourceRegistry) is responsible + /// for keeping 'resource' alive while the pool exists. + std::shared_ptr addCustomRootPool( + const std::string& name, + std::shared_ptr resource, + const std::optional& poolDebugOpts = + std::nullopt); /// Creates a leaf memory pool for direct memory allocation use with specified /// 'name'. If 'name' is missing, the memory manager generates a default name @@ -319,8 +316,8 @@ class MemoryManager { std::unique_ptr& reclaimer, MemoryPool::Options& options); - // Shared implementation for addRootPool overloads. 'customAllocator' and - // 'customArbitrator' are borrowed and may be null (default tier). + // 'customAllocator' and 'customArbitrator' are borrowed pointers; if both + // are null, the manager's default tier is used. std::shared_ptr addRootPoolImpl( const std::string& name, int64_t maxCapacity, @@ -358,11 +355,6 @@ class MemoryManager { mutable folly::SharedMutex mutex_; // All user root pools allocated from 'this'. std::unordered_map> pools_; - - // Shared ownership so a resource outlives every pool that uses it. Keyed - // by 'tag' so addRootPool() can resolve a resource in O(1). - folly::F14FastMap> - customResources_; }; /// Initializes the process-wide memory manager based on the specified diff --git a/velox/common/memory/tests/CMakeLists.txt b/velox/common/memory/tests/CMakeLists.txt index 9e060414c9a..09c276ecf05 100644 --- a/velox/common/memory/tests/CMakeLists.txt +++ b/velox/common/memory/tests/CMakeLists.txt @@ -41,6 +41,7 @@ target_link_libraries( PRIVATE velox_caching velox_common_base + velox_custom_memory_resource_registry velox_exception velox_exec velox_exec_test_lib diff --git a/velox/common/memory/tests/CustomMemoryEmulationTest.cpp b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp index a8a6a117b27..ec4f7141902 100644 --- a/velox/common/memory/tests/CustomMemoryEmulationTest.cpp +++ b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp @@ -15,10 +15,10 @@ */ #include -#include #include #include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/CustomMemoryResourceRegistry.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryArbitrator.h" @@ -27,8 +27,6 @@ namespace facebook::velox::memory::test { namespace { -// Mirrors the row layout the docs assume for the CXL-backed HashAggregation -// example: a fixed-width group key plus a single int64_t accumulator. struct EmulatedRow { int64_t key; int64_t sum; @@ -37,8 +35,7 @@ struct EmulatedRow { class EmulatedCxlHashAggregation; // Reclaimer installed on the operator's DRAM leaf row pool. Forwards -// reclaim() to the operator's DRAM -> CXL spill method, modeling Phase 1 -// in the spilling docs. +// reclaim() to the operator's DRAM -> CXL spill method. class DramReclaimer : public MemoryReclaimer { public: explicit DramReclaimer(EmulatedCxlHashAggregation* op) @@ -54,48 +51,20 @@ class DramReclaimer : public MemoryReclaimer { EmulatedCxlHashAggregation* const op_; }; -// Reclaimer attached to the CXL custom root pool. Constructed by the -// resource's reclaimerFactory before the operator exists; the operator -// fills in 'callback' after construction and clears it on destruction. -class CxlReclaimer : public MemoryReclaimer { - public: - using Callback = std::function; - - explicit CxlReclaimer(std::shared_ptr callback) - : MemoryReclaimer(0), callback_(std::move(callback)) {} - - uint64_t reclaim( - MemoryPool* /*pool*/, - uint64_t targetBytes, - uint64_t /*maxWaitMs*/, - Stats& /*stats*/) override { - if (!callback_ || !*callback_) { - return 0; - } - return (*callback_)(targetBytes); - } - - private: - const std::shared_ptr callback_; -}; - // Mini hash-aggregation operator emulating the CXL-aware HashAggregation // described in spilling.rst (`Reclaim Across Memory Resources`). Allocates // row bodies through MemoryPool to keep used-byte counters honest, -// partitions rows by key, and exposes spill methods that the two -// reclaimers invoke. +// partitions rows by key, and exposes the DRAM -> CXL spill method that +// the DRAM-leaf reclaimer invokes. class EmulatedCxlHashAggregation { public: static constexpr int kNumPartitions = 8; - EmulatedCxlHashAggregation( - core::QueryCtx* ctx, - std::shared_ptr cxlCallbackSlot) + explicit EmulatedCxlHashAggregation(core::QueryCtx* ctx) : ctx_(ctx), dramRowPool_(ctx->pool()->addLeafChild("emulated-dram-rows")), cxlRootPool_(ctx->customPool("cxl")), - cxlRowPool_(cxlRootPool_->addLeafChild("emulated-cxl-rows")), - cxlCallbackSlot_(std::move(cxlCallbackSlot)) { + cxlRowPool_(cxlRootPool_->addLeafChild("emulated-cxl-rows")) { VELOX_CHECK_NOT_NULL(cxlRootPool_); // MemoryPool::setReclaimer requires the parent to have a reclaimer // first. The default root pool typically has none, so install a noop @@ -104,17 +73,9 @@ class EmulatedCxlHashAggregation { ctx_->pool()->setReclaimer(MemoryReclaimer::create(0)); } dramRowPool_->setReclaimer(std::make_unique(this)); - *cxlCallbackSlot_ = [this](uint64_t targetBytes) { - return spillCxlPartitionToDisk(targetBytes); - }; } ~EmulatedCxlHashAggregation() { - // Clear the CXL reclaimer's callback so a stale reclaim invocation - // after operator destruction cannot dereference 'this'. - if (cxlCallbackSlot_) { - *cxlCallbackSlot_ = {}; - } // Free every row still resident in DRAM or CXL before the pool // shared_ptrs go out of scope; otherwise the pool destructors abort // on outstanding usage. @@ -144,32 +105,23 @@ class EmulatedCxlHashAggregation { } // Reads every row still resident in the hash table (DRAM and CXL both - // CPU-addressable in this emulation) and merges with disk-resident - // rows to produce the final per-key aggregate. + // CPU-addressable in this emulation) to produce the final per-key + // aggregate. std::unordered_map finalize() const { std::unordered_map result; for (const auto& [_, entry] : hashTable_) { result[entry.row->key] += entry.row->sum; } - for (const auto& row : diskRows_) { - result[row.key] += row.sum; - } return result; } // Returns the DRAM leaf pool — has the operator's DramReclaimer - // installed, so the test can trigger Phase 1 via dramPool()->reclaim(). + // installed, so the test can trigger DRAM -> CXL spill via + // dramPool()->reclaim(). MemoryPool* dramPool() const { return dramRowPool_.get(); } - // Returns the CXL root pool — has the resource's CxlReclaimer installed - // via reclaimerFactory, so the test can trigger Phase 2 via - // cxlPool()->reclaim(). - MemoryPool* cxlPool() const { - return cxlRootPool_.get(); - } - size_t hashTableSize() const { return hashTable_.size(); } @@ -182,13 +134,9 @@ class EmulatedCxlHashAggregation { return countByLocation(Location::kCxl); } - size_t diskRowCount() const { - return diskRows_.size(); - } - - // Phase 1 (DRAM -> CXL): move the partition with the most - // DRAM-resident rows into CXL. The hash-table bucket for each row is - // swizzled to its new CXL address; the entry is not removed. + // Moves the partition with the most DRAM-resident rows into CXL. The + // hash-table bucket for each row is swizzled to its new CXL address; + // the entry is not removed. uint64_t spillTopPartitionToCxl(uint64_t /*targetBytes*/) { const int partition = pickPartition(Location::kDram); if (partition < 0) { @@ -211,30 +159,6 @@ class EmulatedCxlHashAggregation { return freed; } - // Phase 2 (CXL -> disk): move the partition with the most CXL-resident - // rows into the disk-resident vector. The hash-table entries are - // erased, because the rows are no longer directly addressable from the - // CPU after the on-disk copy is the only canonical copy. - uint64_t spillCxlPartitionToDisk(uint64_t /*targetBytes*/) { - const int partition = pickPartition(Location::kCxl); - if (partition < 0) { - return 0; - } - uint64_t freed = 0; - for (auto it = hashTable_.begin(); it != hashTable_.end();) { - if (it->second.location != Location::kCxl || - partitionOf(it->first) != partition) { - ++it; - continue; - } - diskRows_.push_back(*it->second.row); - cxlRowPool_->free(it->second.row, sizeof(EmulatedRow)); - it = hashTable_.erase(it); - freed += sizeof(EmulatedRow); - } - return freed; - } - private: enum class Location { kDram, kCxl }; struct Entry { @@ -278,16 +202,13 @@ class EmulatedCxlHashAggregation { core::QueryCtx* const ctx_; std::shared_ptr dramRowPool_; - // Root of the CXL custom resource; holds the CxlReclaimer installed by - // the resource's reclaimerFactory. Reserved for reclaim() dispatch only; + // Root of the CXL custom resource. Reserved for child-pool creation; // allocations go through 'cxlRowPool_' below. std::shared_ptr cxlRootPool_; // Leaf child of 'cxlRootPool_'. Row bodies are allocated and freed // here. std::shared_ptr cxlRowPool_; - std::shared_ptr cxlCallbackSlot_; std::unordered_map hashTable_; - std::vector diskRows_; }; uint64_t DramReclaimer::reclaim( @@ -298,46 +219,28 @@ uint64_t DramReclaimer::reclaim( return op_->spillTopPartitionToCxl(targetBytes); } -// Bundles a CustomMemoryResource backed by MallocAllocator with the -// callback slot the operator fills in after QueryCtx construction. -struct CxlResourceBundle { - CustomMemoryResource resource; - std::shared_ptr callbackSlot; -}; - -CxlResourceBundle makeCxlResource() { +std::shared_ptr makeCxlResource() { MemoryAllocator::Options allocatorOptions; allocatorOptions.capacity = 1L << 30; - CxlResourceBundle bundle; - bundle.callbackSlot = std::make_shared(); - bundle.resource.tag = "cxl"; - bundle.resource.maxCapacity = 1L << 30; - bundle.resource.allocator = - std::make_shared(allocatorOptions); - bundle.resource.arbitrator = MemoryArbitrator::create({}); - auto slot = bundle.callbackSlot; - bundle.resource.reclaimerFactory = [slot]() { - return std::make_unique(slot); - }; - return bundle; + return std::make_shared( + "cxl", + std::make_shared(allocatorOptions), + MemoryArbitrator::create({}), + []() { return MemoryReclaimer::create(0); }); } // Materializes the CXL custom pool through the registered resource and -// attaches it to a fresh QueryCtx keyed by 'tag'. Mirrors the recommended -// caller-builds-pool flow for custom memory resources. +// attaches it to a fresh QueryCtx keyed by 'tag'. std::shared_ptr buildQueryCtxWithCxl( - CxlResourceBundle& bundle, + std::shared_ptr registry, + std::shared_ptr resource, const std::string& queryId) { auto* manager = memoryManager(); - manager->registerCustomResource(std::move(bundle.resource)); - const auto& registered = *manager->customResources().at("cxl"); - auto reclaimer = registered.reclaimerFactory(); - auto pool = manager->addRootPool( - fmt::format("{}.cxl", queryId), - registered.maxCapacity, - std::move(reclaimer), - std::nullopt, - "cxl"); + registry->insert("cxl", resource); + auto registered = registry->find("cxl"); + VELOX_CHECK_NOT_NULL(registered); + auto pool = + manager->addCustomRootPool(fmt::format("{}.cxl", queryId), registered); return core::QueryCtx::Builder() .customPool("cxl", std::move(pool)) .queryId(queryId) @@ -348,19 +251,23 @@ std::shared_ptr buildQueryCtxWithCxl( // End-to-end coverage for the CXL-backed HashAggregation flow documented in // spilling.rst (`Reclaim Across Memory Resources` -> `Example: CXL-Backed -// Hash Aggregation`). The CXL custom memory resource is backed by -// MallocAllocator so the test runs on hardware without real CXL devices. +// Hash Aggregation`), restricted to the DRAM -> CXL hop. The CXL custom +// memory resource is backed by MallocAllocator so the test runs on +// hardware without real CXL devices. class CustomMemoryEmulationTest : public testing::Test { protected: void SetUp() override { MemoryManager::testingSetInstance(MemoryManager::Options{}); + registry_ = CustomMemoryResourceRegistry::createRegistry(nullptr); } + + std::shared_ptr registry_; }; TEST_F(CustomMemoryEmulationTest, baselineAggregationWithoutSpill) { - auto cxl = makeCxlResource(); - auto queryCtx = buildQueryCtxWithCxl(cxl, "cxl-baseline"); - EmulatedCxlHashAggregation op(queryCtx.get(), cxl.callbackSlot); + auto queryCtx = + buildQueryCtxWithCxl(registry_, makeCxlResource(), "cxl-baseline"); + EmulatedCxlHashAggregation op(queryCtx.get()); std::unordered_map expected; for (int i = 0; i < 64; ++i) { @@ -373,14 +280,13 @@ TEST_F(CustomMemoryEmulationTest, baselineAggregationWithoutSpill) { EXPECT_EQ(op.hashTableSize(), expected.size()); EXPECT_EQ(op.dramRowCount(), expected.size()); EXPECT_EQ(op.cxlRowCount(), 0); - EXPECT_EQ(op.diskRowCount(), 0); EXPECT_EQ(op.finalize(), expected); } -TEST_F(CustomMemoryEmulationTest, dramToCxlToDiskChainPreservesIntegrity) { - auto cxl = makeCxlResource(); - auto queryCtx = buildQueryCtxWithCxl(cxl, "cxl-chain"); - EmulatedCxlHashAggregation op(queryCtx.get(), cxl.callbackSlot); +TEST_F(CustomMemoryEmulationTest, dramToCxlPreservesIntegrity) { + auto queryCtx = + buildQueryCtxWithCxl(registry_, makeCxlResource(), "cxl-chain"); + EmulatedCxlHashAggregation op(queryCtx.get()); std::unordered_map expected; for (int i = 0; i < 64; ++i) { @@ -392,53 +298,33 @@ TEST_F(CustomMemoryEmulationTest, dramToCxlToDiskChainPreservesIntegrity) { const size_t totalKeys = expected.size(); ASSERT_EQ(op.dramRowCount(), totalKeys); ASSERT_EQ(op.cxlRowCount(), 0); - ASSERT_EQ(op.diskRowCount(), 0); MemoryReclaimer::Stats stats; - // Phase 1: DRAM -> CXL. The operator's DramReclaimer moves the top - // partition's rows into the CXL pool and swizzles the hash-table - // bucket pointers to the new addresses. The entries themselves remain; - // probe and finalize logic continue to read them directly. + // DRAM -> CXL. The operator's DramReclaimer moves the top partition's + // rows into the CXL pool and swizzles the hash-table bucket pointers to + // the new addresses. The entries themselves remain; probe and finalize + // logic continue to read them directly. const uint64_t freed1 = op.dramPool()->reclaim( /*targetBytes=*/64, /*maxWaitMs=*/0, stats); EXPECT_GT(freed1, 0); const size_t cxlAfterFirst = op.cxlRowCount(); EXPECT_GT(cxlAfterFirst, 0); EXPECT_LT(op.dramRowCount(), totalKeys); - EXPECT_EQ(op.diskRowCount(), 0); EXPECT_EQ(op.hashTableSize(), totalKeys) - << "Phase 1 swizzles bucket pointers; size must not change."; + << "DRAM -> CXL swizzles bucket pointers; size must not change."; - // Second DRAM reclaim moves another partition to CXL. + // Second reclaim moves another partition to CXL. const uint64_t freed2 = op.dramPool()->reclaim( /*targetBytes=*/64, /*maxWaitMs=*/0, stats); EXPECT_GT(freed2, 0); EXPECT_GT(op.cxlRowCount(), cxlAfterFirst); - EXPECT_EQ(op.diskRowCount(), 0); EXPECT_EQ(op.hashTableSize(), totalKeys); // After two DRAM -> CXL hops, the CXL-resident rows are still directly // readable: finalize() returns the same per-key sums it would in the // baseline run. EXPECT_EQ(op.finalize(), expected); - - // Phase 2: CXL -> disk. Arbitration on the CXL custom pool triggers the - // CxlReclaimer; the operator copies the top CXL partition's rows into - // the disk-resident vector and erases their hash-table entries. - const size_t cxlBeforePhase2 = op.cxlRowCount(); - const size_t hashSizeBeforePhase2 = op.hashTableSize(); - const uint64_t freed3 = op.cxlPool()->reclaim( - /*targetBytes=*/64, /*maxWaitMs=*/0, stats); - EXPECT_GT(freed3, 0); - EXPECT_GT(op.diskRowCount(), 0); - EXPECT_LT(op.cxlRowCount(), cxlBeforePhase2); - EXPECT_LT(op.hashTableSize(), hashSizeBeforePhase2) - << "Phase 2 erases hash-table entries for spilled keys."; - - // Phase 3: finalize merges DRAM-resident + CXL-resident + disk rows - // and yields the same per-key sums the baseline (no-spill) run would. - EXPECT_EQ(op.finalize(), expected); } } // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/CustomMemoryPoolTest.cpp b/velox/common/memory/tests/CustomMemoryPoolTest.cpp index 648cdf8a354..275da4ba7d8 100644 --- a/velox/common/memory/tests/CustomMemoryPoolTest.cpp +++ b/velox/common/memory/tests/CustomMemoryPoolTest.cpp @@ -18,6 +18,7 @@ #include #include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/CustomMemoryResourceRegistry.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryArbitrator.h" @@ -26,56 +27,40 @@ namespace facebook::velox::memory::test { namespace { -CustomMemoryResource makeResource(const std::string& tag) { +std::shared_ptr makeResource( + const std::string& tag, + int64_t maxCapacity = 1L << 30) { MemoryAllocator::Options allocatorOptions; - allocatorOptions.capacity = 1L << 30; - CustomMemoryResource resource; - resource.tag = tag; - resource.allocator = std::make_shared(allocatorOptions); - resource.arbitrator = MemoryArbitrator::create({}); - resource.reclaimerFactory = []() { return MemoryReclaimer::create(0); }; - return resource; -} - -// Builds a root pool from a CustomMemoryResource that the caller has already -// registered on 'manager'. The reclaimer is whatever the caller chooses; for -// tests that don't need a QueryCtx-aware reclaimer this is just the resource's -// default factory. -std::shared_ptr buildPool( - MemoryManager* manager, - const CustomMemoryResource& resource, - const std::string& poolName, - int64_t maxCapacity = kMaxMemory, - std::unique_ptr reclaimer = nullptr) { - if (reclaimer == nullptr && resource.reclaimerFactory) { - reclaimer = resource.reclaimerFactory(); - } - return manager->addRootPool( - poolName, - maxCapacity, - std::move(reclaimer), - std::nullopt, - resource.tag); + allocatorOptions.capacity = maxCapacity; + return std::make_shared( + tag, + std::make_shared(allocatorOptions), + MemoryArbitrator::create({}), + []() { return MemoryReclaimer::create(0); }, + maxCapacity); } } // namespace -// Each test gets its own MemoryManager so per-test resource registrations -// cannot collide on tag. +// Each test gets a fresh MemoryManager and its own isolated registry so +// concurrent tests cannot collide on tag registration in the global scope. class CustomMemoryPoolTest : public testing::Test { protected: void SetUp() override { MemoryManager::testingSetInstance(MemoryManager::Options{}); + registry_ = CustomMemoryResourceRegistry::createRegistry(nullptr); } + + std::shared_ptr registry_; }; TEST_F(CustomMemoryPoolTest, customPoolCreation) { auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("gpu")); - const auto& registered = *manager->customResources().at("gpu"); + registry_->insert("gpu", makeResource("gpu", /*maxCapacity=*/1L << 28)); + auto resource = registry_->find("gpu"); + ASSERT_NE(resource, nullptr); - auto pool = buildPool( - manager, registered, "query.q0.gpu", /*maxCapacity=*/1L << 28); + auto pool = manager->addCustomRootPool("query.q0.gpu", resource); auto queryCtx = core::QueryCtx::Builder() .customPool("gpu", pool) .queryId("q0") @@ -94,14 +79,16 @@ TEST_F(CustomMemoryPoolTest, customPoolCreation) { TEST_F(CustomMemoryPoolTest, customPoolsKeyedByTag) { auto* manager = memoryManager(); for (const auto* tag : {"a", "b", "c"}) { - manager->registerCustomResource(makeResource(tag)); + registry_->insert(tag, makeResource(tag)); } auto builder = core::QueryCtx::Builder().queryId("q-keyed"); for (const auto* tag : {"a", "b", "c"}) { - const auto& registered = *manager->customResources().at(tag); + auto resource = registry_->find(tag); + ASSERT_NE(resource, nullptr); builder.customPool( - tag, buildPool(manager, registered, fmt::format("q-keyed.{}", tag))); + tag, + manager->addCustomRootPool(fmt::format("q-keyed.{}", tag), resource)); } auto queryCtx = builder.build(); @@ -111,16 +98,16 @@ TEST_F(CustomMemoryPoolTest, customPoolsKeyedByTag) { EXPECT_NE(queryCtx->customPool("c"), nullptr); } -// Locks down the core dispatch invariant: a custom-resource root pool and its -// children must allocate through the resource's allocator, not the -// MemoryManager default. +// A custom-resource root pool and its children must allocate through the +// resource's allocator, not the MemoryManager default. TEST_F(CustomMemoryPoolTest, customPoolDispatchesToResourceAllocator) { auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("gpu")); - const auto& registered = *manager->customResources().at("gpu"); - auto* expectedAllocator = registered.allocator.get(); + registry_->insert("gpu", makeResource("gpu")); + auto resource = registry_->find("gpu"); + ASSERT_NE(resource, nullptr); + auto* expectedAllocator = resource->allocator(); - auto pool = buildPool(manager, registered, "q-dispatch.gpu"); + auto pool = manager->addCustomRootPool("q-dispatch.gpu", resource); auto queryCtx = core::QueryCtx::Builder() .customPool("gpu", pool) .queryId("q-dispatch") @@ -141,11 +128,52 @@ TEST_F(CustomMemoryPoolTest, customPoolDispatchesToResourceAllocator) { expectedAllocator); } +TEST_F(CustomMemoryPoolTest, builderRejectsBadInputs) { + auto* manager = memoryManager(); + registry_->insert("gpu", makeResource("gpu")); + auto resource = registry_->find("gpu"); + ASSERT_NE(resource, nullptr); + auto pool = manager->addCustomRootPool("q-bad.gpu", resource); + + auto builder = core::QueryCtx::Builder().queryId("q-bad"); + builder.customPool("gpu", pool); + EXPECT_THROW(builder.customPool("gpu", pool), VeloxRuntimeError); + EXPECT_THROW(builder.customPool("other", nullptr), VeloxRuntimeError); + EXPECT_THROW(builder.customPool("", pool), VeloxRuntimeError); +} + +// QueryCtx::addCustomPool is part of the public surface so a query can +// attach a custom pool after the Builder has already produced the QueryCtx. +// Verifies the same validation as Builder::customPool applies. +TEST_F(CustomMemoryPoolTest, addCustomPoolDirectly) { + auto* manager = memoryManager(); + registry_->insert("gpu", makeResource("gpu")); + auto resource = registry_->find("gpu"); + ASSERT_NE(resource, nullptr); + + auto queryCtx = core::QueryCtx::Builder().queryId("q-direct").build(); + ASSERT_EQ(queryCtx->customPools().size(), 0); + + auto pool = manager->addCustomRootPool("q-direct.gpu", resource); + queryCtx->addCustomPool("gpu", pool); + EXPECT_EQ(queryCtx->customPool("gpu").get(), pool.get()); + EXPECT_EQ(queryCtx->customPools().size(), 1); + + EXPECT_THROW(queryCtx->addCustomPool("gpu", pool), VeloxRuntimeError); + EXPECT_THROW(queryCtx->addCustomPool("other", nullptr), VeloxRuntimeError); + EXPECT_THROW(queryCtx->addCustomPool("", pool), VeloxRuntimeError); +} + +TEST_F(CustomMemoryPoolTest, addCustomRootPoolRejectsNullResource) { + auto* manager = memoryManager(); + EXPECT_THROW(manager->addCustomRootPool("q.null", nullptr), VeloxUserError); +} + namespace { // Test reclaimer that "spills" by allocating from a sibling resource's pool. // Models the chained-reclaimer pattern (e.g. device-memory pool spills into a -// pinned-host pool). +// pinned-host pool). The sibling pool is captured via shared_ptr. class SpillToSiblingReclaimer : public MemoryReclaimer { public: static std::unique_ptr create( @@ -155,9 +183,9 @@ class SpillToSiblingReclaimer : public MemoryReclaimer { } ~SpillToSiblingReclaimer() override { - // Release any buffers we acquired during reclaim before the leaf pools - // they came from get destroyed; otherwise the leaf destructor aborts on - // outstanding usage. + // Release any buffers acquired during reclaim before the leaf pools + // they came from get destroyed; otherwise the leaf destructor aborts + // on outstanding usage. for (auto& spill : spills_) { spill.leaf->free(spill.buffer, static_cast(spill.bytes)); } @@ -197,29 +225,29 @@ class SpillToSiblingReclaimer : public MemoryReclaimer { } // namespace -// Locks down the cross-resource spill flow: when reclaim is triggered on a -// custom pool, the resource's reclaimer can allocate into a sibling resource's -// pool, modeling device -> pinned-host spill. The caller builds the host pool -// first, captures it inside the device reclaimer, then builds the device pool -// with that reclaimer — no QueryCtx pointer is needed during reclaimer -// construction because the sibling pool itself is the link. +// The device resource's reclaimerFactory closes over the previously-built +// host pool, so reclaim on the device pool routes allocations into the host. TEST_F(CustomMemoryPoolTest, deviceReclaimerSpillsToHostSibling) { auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("host")); - manager->registerCustomResource(makeResource("device")); - const auto& hostResource = *manager->customResources().at("host"); - const auto& deviceResource = *manager->customResources().at("device"); - auto hostPool = buildPool(manager, hostResource, "q-spill.host"); - auto deviceReclaimer = SpillToSiblingReclaimer::create(hostPool); + registry_->insert("host", makeResource("host")); + auto hostResource = registry_->find("host"); + ASSERT_NE(hostResource, nullptr); + auto hostPool = manager->addCustomRootPool("q-spill.host", hostResource); + + MemoryAllocator::Options deviceAllocatorOptions; + deviceAllocatorOptions.capacity = 1L << 30; + auto deviceResource = std::make_shared( + "device", + std::make_shared(deviceAllocatorOptions), + MemoryArbitrator::create({}), + [hostPool]() { return SpillToSiblingReclaimer::create(hostPool); }); + + auto devicePool = + manager->addCustomRootPool("q-spill.device", deviceResource); auto* mockReclaimer = - static_cast(deviceReclaimer.get()); - auto devicePool = buildPool( - manager, - deviceResource, - "q-spill.device", - kMaxMemory, - std::move(deviceReclaimer)); + static_cast(devicePool->reclaimer()); + ASSERT_NE(mockReclaimer, nullptr); auto queryCtx = core::QueryCtx::Builder() .customPool("host", hostPool) @@ -238,84 +266,4 @@ TEST_F(CustomMemoryPoolTest, deviceReclaimerSpillsToHostSibling) { EXPECT_GE(hostPool->usedBytes(), static_cast(target)); } -// Builder rejects duplicate tags and null pools. -TEST_F(CustomMemoryPoolTest, builderRejectsBadInputs) { - auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("gpu")); - const auto& registered = *manager->customResources().at("gpu"); - auto pool = buildPool(manager, registered, "q-bad.gpu"); - - auto builder = core::QueryCtx::Builder().queryId("q-bad"); - builder.customPool("gpu", pool); - EXPECT_THROW(builder.customPool("gpu", pool), VeloxRuntimeError); - EXPECT_THROW(builder.customPool("other", nullptr), VeloxRuntimeError); - EXPECT_THROW(builder.customPool("", pool), VeloxRuntimeError); -} - -// QueryCtx::addCustomPool is part of the public surface so a query can -// attach a custom pool after the Builder has already produced the QueryCtx. -// Verifies the same validation as Builder::customPool applies. -TEST_F(CustomMemoryPoolTest, addCustomPoolDirectly) { - auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("gpu")); - const auto& registered = *manager->customResources().at("gpu"); - - auto queryCtx = core::QueryCtx::Builder().queryId("q-direct").build(); - ASSERT_EQ(queryCtx->customPools().size(), 0); - - auto pool = buildPool(manager, registered, "q-direct.gpu"); - queryCtx->addCustomPool("gpu", pool); - EXPECT_EQ(queryCtx->customPool("gpu").get(), pool.get()); - EXPECT_EQ(queryCtx->customPools().size(), 1); - - EXPECT_THROW(queryCtx->addCustomPool("gpu", pool), VeloxRuntimeError); - EXPECT_THROW(queryCtx->addCustomPool("other", nullptr), VeloxRuntimeError); - EXPECT_THROW(queryCtx->addCustomPool("", pool), VeloxRuntimeError); -} - -// Models the QueryCtx-aware reclaimer flow: build the pool without a -// reclaimer (skipping the resource's factory), hand it to the Builder, then -// attach a reclaimer that references the QueryCtx via -// MemoryPool::setReclaimer. The reclaimer reaches a sibling pool through -// queryCtx->customPool(). -TEST_F(CustomMemoryPoolTest, postBuildSetReclaimerWithQueryCtxAware) { - auto* manager = memoryManager(); - manager->registerCustomResource(makeResource("host")); - manager->registerCustomResource(makeResource("device")); - - auto hostPool = manager->addRootPool( - "q-postbuild.host", - kMaxMemory, - /*reclaimer=*/nullptr, - std::nullopt, - "host"); - auto devicePool = manager->addRootPool( - "q-postbuild.device", - kMaxMemory, - /*reclaimer=*/nullptr, - std::nullopt, - "device"); - - auto queryCtx = core::QueryCtx::Builder() - .customPool("host", hostPool) - .customPool("device", devicePool) - .queryId("q-postbuild") - .build(); - - // Attach a sibling-aware reclaimer to the device pool after the QueryCtx - // exists, reaching the host pool through customPool(). - auto reclaimer = - SpillToSiblingReclaimer::create(queryCtx->customPool("host")); - auto* mockReclaimer = static_cast(reclaimer.get()); - devicePool->setReclaimer(std::move(reclaimer)); - - const uint64_t target = 4 * 1024; - MemoryReclaimer::Stats stats; - const auto reclaimed = devicePool->reclaim(target, 0, stats); - - EXPECT_EQ(mockReclaimer->numReclaimCalls(), 1); - EXPECT_EQ(reclaimed, target); - EXPECT_GE(hostPool->usedBytes(), static_cast(target)); -} - } // namespace facebook::velox::memory::test diff --git a/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp index d95ed86e385..24c101b7137 100644 --- a/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp +++ b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp @@ -18,6 +18,7 @@ #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/CustomMemoryResourceRegistry.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/Memory.h" #include "velox/common/memory/MemoryArbitrator.h" @@ -25,98 +26,110 @@ namespace facebook::velox::memory::test { namespace { -CustomMemoryResource makeResource(const std::string& tag) { - MemoryAllocator::Options allocatorOptions; - allocatorOptions.capacity = 1L << 30; - CustomMemoryResource resource; - resource.tag = tag; - resource.allocator = std::make_shared(allocatorOptions); - resource.arbitrator = MemoryArbitrator::create({}); - resource.reclaimerFactory = []() { return MemoryReclaimer::create(0); }; - return resource; +std::shared_ptr makeAllocator() { + MemoryAllocator::Options options; + options.capacity = 1L << 30; + return std::make_shared(options); } -} // namespace +CustomMemoryResource::ReclaimerFactory makeReclaimerFactory() { + return []() { return MemoryReclaimer::create(0); }; +} -TEST(CustomMemoryRegistration, registrationValidation) { - MemoryManager manager{}; - ASSERT_TRUE(manager.customResources().empty()); +std::shared_ptr makeResource(const std::string& tag) { + return std::make_shared( + tag, makeAllocator(), MemoryArbitrator::create({}), makeReclaimerFactory()); +} - auto resource = makeResource("test-resource"); - resource.maxCapacity = 1L << 28; - manager.registerCustomResource(std::move(resource)); +} // namespace - ASSERT_EQ(manager.customResources().size(), 1); - const auto& stored = manager.customResources().at("test-resource"); - EXPECT_EQ(stored->tag, "test-resource"); - EXPECT_EQ(stored->maxCapacity, 1L << 28); +TEST(CustomMemoryRegistration, constructorRejectsInvalidFields) { + auto allocator = makeAllocator(); + std::shared_ptr arbitrator = MemoryArbitrator::create({}); + auto factory = makeReclaimerFactory(); - // Try to register another resource with the same tag, which should be - // rejected. - VELOX_ASSERT_THROW( - manager.registerCustomResource(makeResource("test-resource")), - "CustomMemoryResource already registered for tag: test-resource"); - ASSERT_EQ(manager.customResources().size(), 1); + // Valid construction succeeds. + CustomMemoryResource("ok", allocator, arbitrator, factory); - // Try to register resources with missing tag, which should be rejected. - CustomMemoryResource emptyTag; - emptyTag.allocator = makeResource("ignored").allocator; VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(emptyTag)), + CustomMemoryResource("", allocator, arbitrator, factory), "CustomMemoryResource tag is empty"); - - // Try to register resources with null allocator, which should be rejected. - CustomMemoryResource nullAllocator; - nullAllocator.tag = "another"; VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullAllocator)), - "CustomMemoryResource allocator is null for tag: another"); - - // Try to register resources with null arbitrator, which should be rejected. - auto nullArbitrator = makeResource("another"); - nullArbitrator.arbitrator.reset(); + CustomMemoryResource("tag", nullptr, arbitrator, factory), + "CustomMemoryResource allocator is null for tag: tag"); VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullArbitrator)), - "CustomMemoryResource arbitrator is null for tag: another"); - - // Try to register resources with null reclaimerFactory, which should be - // rejected. - auto nullFactory = makeResource("another"); - nullFactory.reclaimerFactory = nullptr; + CustomMemoryResource("tag", allocator, nullptr, factory), + "CustomMemoryResource arbitrator is null for tag: tag"); VELOX_ASSERT_THROW( - manager.registerCustomResource(std::move(nullFactory)), - "CustomMemoryResource reclaimerFactory is null for tag: another"); + CustomMemoryResource("tag", allocator, arbitrator, nullptr), + "CustomMemoryResource reclaimerFactory is null for tag: tag"); } -TEST(CustomMemoryRegistration, registrationsAreReachableByTag) { - MemoryManager manager{}; +TEST(CustomMemoryRegistration, insertAndFindOnIsolatedRegistry) { + auto registry = CustomMemoryResourceRegistry::createRegistry(nullptr); for (const auto* tag : {"a", "b", "c"}) { - manager.registerCustomResource(makeResource(tag)); + registry->insert(tag, makeResource(tag)); } - ASSERT_EQ(manager.customResources().size(), 3); - EXPECT_NE(manager.customResources().find("a"), manager.customResources().end()); - EXPECT_NE(manager.customResources().find("b"), manager.customResources().end()); - EXPECT_NE(manager.customResources().find("c"), manager.customResources().end()); + EXPECT_NE(registry->find("a"), nullptr); + EXPECT_NE(registry->find("b"), nullptr); + EXPECT_NE(registry->find("c"), nullptr); + EXPECT_EQ(registry->find("missing"), nullptr); } -TEST(CustomMemoryRegistration, addRootPoolRejectsUnregisteredTag) { - MemoryManager manager{}; - manager.registerCustomResource(makeResource("gpu")); +TEST(CustomMemoryRegistration, insertRejectsDuplicateTag) { + auto registry = CustomMemoryResourceRegistry::createRegistry(nullptr); + registry->insert("device", makeResource("device")); + VELOX_ASSERT_THROW( + registry->insert("device", makeResource("device")), + "Key already registered: device"); +} - // Adding a root pool with a registered tag should succeed. - auto tagged = manager.addRootPool( - "tagged-root", kMaxMemory, nullptr, std::nullopt, "gpu"); - ASSERT_NE(tagged, nullptr); +TEST(CustomMemoryRegistration, childRegistryShadowsParent) { + auto parent = CustomMemoryResourceRegistry::createRegistry(nullptr); + auto defaultResource = makeResource("device"); + auto* defaultAllocator = defaultResource->allocator(); + parent->insert("device", std::move(defaultResource)); - // Adding a root pool without a tag should succeed. - auto untagged = manager.addRootPool("untagged-root"); - ASSERT_NE(untagged, nullptr); + auto child = CustomMemoryResourceRegistry::createRegistry(parent.get()); + auto tenantResource = makeResource("device"); + auto* tenantAllocator = tenantResource->allocator(); + child->insert("device", std::move(tenantResource)); - // Adding a root pool with an unregistered tag should be rejected. - VELOX_ASSERT_THROW( - manager.addRootPool( - "bad-root", kMaxMemory, nullptr, std::nullopt, "not-registered"), - "No CustomMemoryResource registered for tag: not-registered"); + EXPECT_EQ(child->find("device")->allocator(), tenantAllocator); + EXPECT_EQ(parent->find("device")->allocator(), defaultAllocator) + << "Parent must remain untouched by scoped registration."; +} + +TEST(CustomMemoryRegistration, childRegistryFallsBackToParent) { + auto parent = CustomMemoryResourceRegistry::createRegistry(nullptr); + parent->insert("only-parent", makeResource("only-parent")); + auto child = CustomMemoryResourceRegistry::createRegistry(parent.get()); + EXPECT_NE(child->find("only-parent"), nullptr); +} + +TEST(CustomMemoryRegistration, isolationRegistryHasNoFallback) { + auto parent = CustomMemoryResourceRegistry::createRegistry(nullptr); + parent->insert("only-parent", makeResource("only-parent")); + + auto isolated = CustomMemoryResourceRegistry::createRegistry(nullptr); + EXPECT_EQ(isolated->find("only-parent"), nullptr) + << "Isolation mode must not fall back to any parent."; + + isolated->insert("only-local", makeResource("only-local")); + EXPECT_NE(isolated->find("only-local"), nullptr); + EXPECT_EQ(parent->find("only-local"), nullptr); +} + +TEST(CustomMemoryRegistration, addCustomRootPoolWithResource) { + MemoryManager manager{}; + auto registry = CustomMemoryResourceRegistry::createRegistry(nullptr); + registry->insert("gpu", makeResource("gpu")); + + auto resource = registry->find("gpu"); + ASSERT_NE(resource, nullptr); + + auto tagged = manager.addCustomRootPool("tagged-root", resource); + ASSERT_NE(tagged, nullptr); } } // namespace facebook::velox::memory::test diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 1cd605923e8..1eb3da21544 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -182,9 +182,7 @@ class QueryCtx : public std::enable_shared_from_this { /// Registers a caller-built root pool under 'tag' on the resulting /// QueryCtx. Throws if 'tag' is already present or 'pool' is null. The - /// caller is responsible for constructing the pool, typically through - /// MemoryManager::addRootPool(name, cap, reclaimer, debugOpts, tag) with - /// a CustomMemoryResource previously registered on the MemoryManager. + /// pool is typically built through MemoryManager::addCustomRootPool. Builder& customPool( std::string tag, std::shared_ptr pool) { @@ -397,9 +395,10 @@ class QueryCtx : public std::enable_shared_from_this { } /// Tracks an additional root pool keyed by 'tag'. The pool's allocator - /// and arbitrator are borrowed from a CustomMemoryResource owned by the - /// MemoryManager registry, so its lifetime is governed externally. - /// Throws if 'tag' is already present or 'pool' is null. + /// and arbitrator are borrowed from the CustomMemoryResource the caller + /// passed to MemoryManager::addCustomRootPool; the resource's lifetime + /// is governed externally. Throws if 'tag' is already present or 'pool' + /// is null. void addCustomPool(std::string tag, std::shared_ptr pool); /// Returns the custom root pool for the given resource tag, or nullptr if diff --git a/velox/docs/develop/memory.rst b/velox/docs/develop/memory.rst index 2bcf958da29..801fc21b0e4 100644 --- a/velox/docs/develop/memory.rst +++ b/velox/docs/develop/memory.rst @@ -872,96 +872,80 @@ these ad-hoc small allocations in practice. Custom Memory Resources ----------------------- -Velox supports heterogeneous compute extensions that operate across multiple -memory tiers. A query running on a GPU connector touches device memory; a -query running on a CXL-attached node touches a memory pool with different -latency and capacity characteristics; a connector issuing DMA transfers -needs a pinned-host allocator; NUMA-aware operators want a pool bound to a -specific socket. Each of these is a distinct memory resource that the engine -must track and arbitrate. - -The default memory system assumes a single homogeneous tier: host DRAM, -managed by *MemoryAllocator* and arbitrated by a single *MemoryArbitrator*. -Under this model, allocations issued by an extension are invisible to -*MemoryManager*, *MemoryArbitrator* cannot reason about them, and operators -have no way to request a memory pool backed by a particular tier. - -*CustomMemoryResource* is the registration primitive that lets an extension -expose additional memory tiers side-by-side with the default CPU tier. A -resource bundles a tag, an allocator, an arbitrator, and a factory that -builds a per-query reclaimer: +*CustomMemoryResource* lets an extension expose memory tiers other than +host DRAM (GPU device memory, CXL-attached memory, pinned host memory, +NUMA-bound pools) side-by-side with the default CPU tier. A resource +bundles a tag, an allocator, an arbitrator, and a factory that builds a +per-query reclaimer. The constructor requires a non-empty tag and +non-null allocator, arbitrator, and reclaimerFactory; the resource is +immutable once constructed: .. code-block:: c++ - struct CustomMemoryResource { - // Unique non-empty identifier. - std::string tag; - - // Capacity of the per-query root pool created for this resource. - int64_t maxCapacity{std::numeric_limits::max()}; - - // Allocator backing pools tagged with this resource. - std::shared_ptr allocator; - - // Arbitrator routing capacity decisions for pools tagged with this resource. - std::shared_ptr arbitrator; - - // Builds a reclaimer for pools tagged with this resource. - std::function()> reclaimerFactory; + class CustomMemoryResource { + public: + CustomMemoryResource( + std::string tag, + std::shared_ptr allocator, + std::shared_ptr arbitrator, + std::function()> reclaimerFactory, + int64_t maxCapacity = std::numeric_limits::max()); + + const std::string& tag() const; + int64_t maxCapacity() const; + MemoryAllocator* allocator() const; + MemoryArbitrator* arbitrator() const; + std::unique_ptr newReclaimer() const; }; -Each resource carries its own allocator and arbitrator. The default CPU +Each resource carries its own allocator and arbitrator; the default CPU allocator and arbitrator on *MemoryManager* are not shared with custom -resources, which keeps per-tier accounting and capacity decisions -self-contained. This matters when tiers differ in size, latency, alignment, -or allocation failure modes — properties that the default arbitrator was -not designed to reason about across resources. +resources. Per-tier accounting and capacity decisions stay self-contained, +which matters when tiers differ in size, latency, alignment, or allocation +failure modes. -Registration and Per-Query Materialization -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Registration +^^^^^^^^^^^^ -Extensions register resources at process startup, immediately after -*initializeMemoryManager*: +Resources are tracked by *CustomMemoryResourceRegistry*. The class exposes +a process-global root via *global()* and creates child scopes via +*createRegistry(parent)*. Registration and lookup go directly through the +*Registry* instance (its *insert*, *find*, and *clear* methods); each +scope is protected by its own lock. Extensions register resources in the +global scope at process startup, after *initializeMemoryManager*: .. code-block:: c++ - auto allocator = std::make_shared(...); - auto arbitrator = MemoryArbitrator::create(...); - memory::memoryManager()->registerCustomResource( - memory::CustomMemoryResource{ - .tag = "device", - .maxCapacity = deviceCapacity, - .allocator = std::move(allocator), - .arbitrator = std::move(arbitrator), - .reclaimerFactory = []() { - return std::make_unique(); - }, - }); - -The *MemoryManager* keeps the resource alive until process shutdown. -Registration is not thread-safe and must happen before any *QueryCtx* is -constructed. - -For each query that wants to use a registered resource, the caller builds -the per-resource root pool through -*MemoryManager::addRootPool(name, maxCapacity, reclaimer, debugOpts, tag)* -and hands it to the *QueryCtx* via *Builder::customPool*: + auto resource = std::make_shared( + "device", + std::make_shared(...), + MemoryArbitrator::create(...), + []() { return std::make_unique(); }, + deviceCapacity); + memory::CustomMemoryResourceRegistry::global().insert( + resource->tag(), resource); + +For each query that wants to use a registered resource, the caller looks +the resource up by tag, materializes the per-resource root pool through +*MemoryManager::addCustomRootPool*, and hands the pool to the *QueryCtx* +via *Builder::customPool*: .. code-block:: c++ auto* manager = memory::memoryManager(); - auto reclaimer = std::make_unique(); - auto devicePool = manager->addRootPool( - "query.q0.device", deviceCapacity, std::move(reclaimer), - /*poolDebugOpts*/ std::nullopt, "device"); + auto resource = + memory::CustomMemoryResourceRegistry::global().find("device"); + VELOX_USER_CHECK_NOT_NULL(resource, "Unknown resource tag: device"); + auto devicePool = manager->addCustomRootPool("query.q0.device", resource); auto queryCtx = core::QueryCtx::Builder() .customPool("device", std::move(devicePool)) .queryId("q0") .build(); -The pool allocates through the registered resource's allocator and is -arbitrated by the registered resource's arbitrator. The root pool is -exposed on *QueryCtx* keyed by tag: +*addCustomRootPool* invokes *resource->newReclaimer()* to build the +pool's reclaimer, uses *resource->maxCapacity()* as the pool capacity, and +backs the pool with *resource->allocator()* and *resource->arbitrator()*. +The root pool is exposed on *QueryCtx* keyed by tag: .. code-block:: c++ @@ -969,60 +953,6 @@ exposed on *QueryCtx* keyed by tag: std::shared_ptr QueryCtx::customPool( const std::string& tag) const; -If the reclaimer needs to address sibling custom pools on the *QueryCtx* -(for instance, to spill into another tier), construct the pool without a -reclaimer and attach the reclaimer post-build with the real *QueryCtx*: - -.. code-block:: c++ - - auto devicePool = manager->addRootPool( - "query.q0.device", deviceCapacity, /*reclaimer*/ nullptr, - std::nullopt, "device"); - auto queryCtx = core::QueryCtx::Builder() - .customPool("device", devicePool) - .queryId("q0") - .build(); - devicePool->setReclaimer(std::make_unique(queryCtx.get())); - -The *reclaimerFactory* on *CustomMemoryResource* is the resource's -commitment to produce reclaimers for its pools. Callers invoke it -explicitly when materializing the per-query pool (the framework no longer -calls it implicitly). It is required at registration; for the -post-build *setReclaimer* path above, the factory's output is unused for -that particular pool but still serves as the default for queries that do -not override it. - -Operators that want to allocate from a particular tier ask the *QueryCtx* -for the matching pool by tag and create their own child leaf pool from it. -Operators that don't care about custom tiers behave exactly as before — -the default CPU root pool returned by *QueryCtx::pool()* is unchanged, and -so is its arbitration path. When a *QueryCtx* is built without any -*customPool* calls, *CustomMemoryResource* is inert for that query. - -Scope -^^^^^ - -The framework is deliberately the *plumbing* layer. It provides: - -* registration and per-query materialization of additional memory pools, -* per-tier accounting and arbitration through the resource's own allocator - and arbitrator, -* an extension point (the per-query reclaimer) for tier-specific reclaim - behavior. - -It does not provide cross-resource arbitration. When a custom pool runs -short of capacity, its own arbitrator decides what to do; the default -*MemoryArbitrator* on *MemoryManager* will not move memory between -resources. If an extension wants reclaim on one tier to spill into another -(for instance, device memory into pinned-host memory), the policy lives in -the per-query reclaimer the caller attaches to the custom pool. The -reclaimer reaches sibling tiers either by holding a *shared_ptr* to the -sibling pool captured at construction, or by being attached after -*QueryCtx::build()* (via *MemoryPool::setReclaimer*) and looking siblings -up through *QueryCtx::customPool*. See -`reclaim across memory resources `_ -in the spilling document. - Server OOM Prevention --------------------- diff --git a/velox/docs/develop/spilling.rst b/velox/docs/develop/spilling.rst index ee16da1c632..4b637fffba7 100644 --- a/velox/docs/develop/spilling.rst +++ b/velox/docs/develop/spilling.rst @@ -535,33 +535,32 @@ Reclaim Across Memory Resources The spilling algorithms above describe how a spillable operator releases memory by serializing in-memory state to disk. With -`custom memory resources `_, the reclaim -path is no longer fixed to disk. The per-query reclaimer attached to a -custom root pool can move data into a sibling tier instead — for example, +`custom memory resources `_, the +reclaim path is not restricted to disk. The per-query reclaimer attached +to a custom root pool can move data into a sibling tier — for example, copying buffers out of a device memory pool into a pinned-host pool, or -evicting cold pages from a CXL-attached pool into a local DRAM pool. The -extension chooses what each reclaimer does; the framework only guarantees -that *MemoryPool::reclaim* on a custom pool dispatches to the reclaimer the -extension supplied. - -The reclaimer holds a pointer to its owning *QueryCtx* and can therefore -address every other custom pool on the query through *QueryCtx::customPool*. -A chain of reclaimers — for instance, hash table on CPU DRAM → CXL on first -reclaim → disk on second reclaim — is built by composing reclaimers across -resources. Each link in the chain is an independent reclaim decision made -by the arbitrator of the pool that ran short. The framework does not -coordinate the chain; the extension chooses the topology by deciding which -sibling each reclaimer allocates into. +evicting cold pages from a CXL-attached pool into a local DRAM pool. +*MemoryPool::reclaim* on a custom pool dispatches to the reclaimer the +*reclaimerFactory* produced; what the reclaimer does is up to the +extension. + +A chain of reclaimers — for instance, hash table on CPU DRAM → CXL on +first reclaim → disk on second reclaim — is built by composing reclaimers +across resources. Each link is an independent reclaim decision made by +the arbitrator of the pool that ran short. The framework does not +coordinate the chain; the extension picks the topology by deciding which +sibling each reclaimer allocates into. A reclaimer can reach a sibling pool +by closure-capturing the sibling at the resource's +*reclaimerFactory*. Example: CXL-Backed Hash Aggregation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The reclaim path above is meaningful when the custom resource exposes a tier with different access properties from CPU DRAM. CXL-attached memory -is one such tier: it is slower than local DRAM but cheaper per byte, and -crucially it is exposed to the CPU as part of the coherent virtual address -space — host code can dereference a CXL pointer directly, with no copy -back to DRAM required. +is one such tier: it is slower than local DRAM but exposed to the CPU as +part of the coherent virtual address space — host code can dereference a +CXL pointer directly, with no copy back to DRAM required. A CXL extension takes advantage of this by registering a CXL custom resource and installing a CXL-aware *HashAggregation* through @@ -607,51 +606,50 @@ the reclaim policy: "DRAM + disk" — the same shape as the default *HashAggregation* algorithm. -The CXL pool is registered like any other custom resource. The resource -carries the reclaimer factory the framework uses to materialize per-query -reclaimers for pools tagged ``"cxl"``: +The CXL pool is registered like any other custom resource on the +*CustomMemoryResourceRegistry*. The resource carries the reclaimer factory +the caller uses to materialize per-query reclaimers for pools tagged +``"cxl"``: .. code-block:: c++ - - memory::memoryManager()->registerCustomResource( - memory::CustomMemoryResource{ - .tag = "cxl", - .maxCapacity = cxlCapacityBytes, - .allocator = std::make_shared(...), - .arbitrator = MemoryArbitrator::create(...), - .reclaimerFactory = []() { - return std::make_unique(); - }, - }); - - // Per query: invoke the registered factory to mint the reclaimer, build - // the CXL root pool with it, then hand the pool to the QueryCtx. + auto cxlResource = std::make_shared( + "cxl", + std::make_shared(...), + MemoryArbitrator::create(...), + []() { return std::make_unique(); }, + cxlCapacityBytes); + memory::CustomMemoryResourceRegistry::global().insert( + cxlResource->tag(), cxlResource); + + // Per query: resolve the resource and materialize the CXL root pool via + // addCustomRootPool, then hand it to the QueryCtx. addCustomRootPool + // invokes the resource's reclaimerFactory and uses the resource's + // maxCapacity, allocator, and arbitrator under the hood. auto* manager = memory::memoryManager(); - auto& cxlResource = *manager->customResources().at("cxl"); - auto cxlPool = manager->addRootPool( - "query.q0.cxl", cxlResource.maxCapacity, - cxlResource.reclaimerFactory(), - /*poolDebugOpts*/ std::nullopt, "cxl"); + auto resolved = memory::CustomMemoryResourceRegistry::global().find("cxl"); + VELOX_USER_CHECK_NOT_NULL(resolved); + auto cxlPool = manager->addCustomRootPool("query.q0.cxl", resolved); auto queryCtx = core::QueryCtx::Builder() .customPool("cxl", std::move(cxlPool)) .queryId("q0") .build(); -If a query needs a reclaimer that references the *QueryCtx* (for instance, -to spill into a sibling custom pool reachable through -*QueryCtx::customPool*), build the pool with a null reclaimer and attach -the QueryCtx-aware reclaimer post-build via *MemoryPool::setReclaimer* — -*setReclaimer* is one-shot, so the path skips the factory entirely. - The arbitrator passed here governs only the CXL pool's capacity. The -default *MemoryArbitrator* on *MemoryManager* continues to govern the DRAM -side; it triggers the first reclaim (step 1) without knowledge that the -"reclaimed" bytes have actually been relocated to CXL. From the default -arbitrator's perspective the DRAM pool simply gave bytes back; from the -CXL arbitrator's perspective new bytes arrived. Cross-tier coordination — -the decision that CXL is the right next stop for DRAM, and disk is the -right next stop for CXL — lives entirely in the extension's operator and -reclaimer, not in the framework. +framework never makes the *decision* to move bytes from DRAM to CXL on +its own; the trigger and the reclaim policy come from the extension. +Step 1 can be driven as follows: the operator calls *MemoryPool::setReclaimer* +on its DRAM-side row pool with a custom reclaimer that moves rows to +CXL. The default *MemoryArbitrator* on *MemoryManager* triggers +reclaim normally when the query is under capacity pressure; the +propagation reaches the installed reclaimer, which redirects the move +to CXL instead of disk. See *DramReclaimer* in +*velox/common/memory/tests/CustomMemoryEmulationTest.cpp* for a +worked example. + +The CXL → disk hop is analogous: install a reclaimer on the CXL root +pool through the resource's *reclaimerFactory*, and let the CXL +pool's arbitrator trigger it when the CXL budget is exhausted. This way, +we can achieve cross-resource arbitration. Future Work ----------- From e1a38d5eec75115d90f17a8cc8e95c2044c4e236 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Wed, 20 May 2026 00:07:44 +0000 Subject: [PATCH 07/10] fix: build error --- velox/dwio/dwrf/test/WriterFlushTest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/velox/dwio/dwrf/test/WriterFlushTest.cpp b/velox/dwio/dwrf/test/WriterFlushTest.cpp index 4612e5e46b1..3d308cfe3ed 100644 --- a/velox/dwio/dwrf/test/WriterFlushTest.cpp +++ b/velox/dwio/dwrf/test/WriterFlushTest.cpp @@ -198,6 +198,10 @@ class MockMemoryPool : public velox::memory::MemoryPool { return nullptr; } + memory::MemoryArbitrator* arbitrator() const override { + return nullptr; + } + void enterArbitration() override { VELOX_UNSUPPORTED("{} unsupported", __FUNCTION__); } From c4e93967c30b9e306ef6712741b87b4d06726609 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Thu, 21 May 2026 00:09:20 +0000 Subject: [PATCH 08/10] fix pre-commit lint errors --- velox/common/memory/CMakeLists.txt | 6 +----- velox/common/memory/CustomMemoryResource.cpp | 8 ++------ .../memory/tests/CustomMemoryEmulationTest.cpp | 13 ++++++------- velox/common/memory/tests/CustomMemoryPoolTest.cpp | 6 ++---- .../memory/tests/CustomMemoryRegistrationTest.cpp | 5 ++++- velox/core/QueryCtx.h | 3 +-- 6 files changed, 16 insertions(+), 25 deletions(-) diff --git a/velox/common/memory/CMakeLists.txt b/velox/common/memory/CMakeLists.txt index 63c4c520d5a..cf0cce89d32 100644 --- a/velox/common/memory/CMakeLists.txt +++ b/velox/common/memory/CMakeLists.txt @@ -79,8 +79,4 @@ velox_add_library( CustomMemoryResourceRegistry.h ) -velox_link_libraries( - velox_custom_memory_resource_registry - velox_memory - velox_scoped_registry -) +velox_link_libraries(velox_custom_memory_resource_registry velox_memory velox_scoped_registry) diff --git a/velox/common/memory/CustomMemoryResource.cpp b/velox/common/memory/CustomMemoryResource.cpp index 1773963a38f..0341903469c 100644 --- a/velox/common/memory/CustomMemoryResource.cpp +++ b/velox/common/memory/CustomMemoryResource.cpp @@ -36,13 +36,9 @@ CustomMemoryResource::CustomMemoryResource( reclaimerFactory_(std::move(reclaimerFactory)) { VELOX_USER_CHECK(!tag_.empty(), "CustomMemoryResource tag is empty"); VELOX_USER_CHECK_NOT_NULL( - allocator_, - "CustomMemoryResource allocator is null for tag: {}", - tag_); + allocator_, "CustomMemoryResource allocator is null for tag: {}", tag_); VELOX_USER_CHECK_NOT_NULL( - arbitrator_, - "CustomMemoryResource arbitrator is null for tag: {}", - tag_); + arbitrator_, "CustomMemoryResource arbitrator is null for tag: {}", tag_); VELOX_USER_CHECK( reclaimerFactory_ != nullptr, "CustomMemoryResource reclaimerFactory is null for tag: {}", diff --git a/velox/common/memory/tests/CustomMemoryEmulationTest.cpp b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp index ec4f7141902..139ee05b0d1 100644 --- a/velox/common/memory/tests/CustomMemoryEmulationTest.cpp +++ b/velox/common/memory/tests/CustomMemoryEmulationTest.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include "velox/common/memory/CustomMemoryResource.h" #include "velox/common/memory/CustomMemoryResourceRegistry.h" @@ -97,8 +97,8 @@ class EmulatedCxlHashAggregation { it->second.row->sum += value; return; } - auto* row = static_cast( - dramRowPool_->allocate(sizeof(EmulatedRow))); + auto* row = + static_cast(dramRowPool_->allocate(sizeof(EmulatedRow))); row->key = key; row->sum = value; hashTable_.emplace(key, Entry{row, Location::kDram}); @@ -144,12 +144,11 @@ class EmulatedCxlHashAggregation { } uint64_t freed = 0; for (auto& [key, entry] : hashTable_) { - if (entry.location != Location::kDram || - partitionOf(key) != partition) { + if (entry.location != Location::kDram || partitionOf(key) != partition) { continue; } - auto* cxlRow = static_cast( - cxlRowPool_->allocate(sizeof(EmulatedRow))); + auto* cxlRow = + static_cast(cxlRowPool_->allocate(sizeof(EmulatedRow))); *cxlRow = *entry.row; dramRowPool_->free(entry.row, sizeof(EmulatedRow)); entry.row = cxlRow; diff --git a/velox/common/memory/tests/CustomMemoryPoolTest.cpp b/velox/common/memory/tests/CustomMemoryPoolTest.cpp index 275da4ba7d8..8627033898f 100644 --- a/velox/common/memory/tests/CustomMemoryPoolTest.cpp +++ b/velox/common/memory/tests/CustomMemoryPoolTest.cpp @@ -61,10 +61,8 @@ TEST_F(CustomMemoryPoolTest, customPoolCreation) { ASSERT_NE(resource, nullptr); auto pool = manager->addCustomRootPool("query.q0.gpu", resource); - auto queryCtx = core::QueryCtx::Builder() - .customPool("gpu", pool) - .queryId("q0") - .build(); + auto queryCtx = + core::QueryCtx::Builder().customPool("gpu", pool).queryId("q0").build(); auto looked = queryCtx->customPool("gpu"); ASSERT_NE(looked, nullptr); diff --git a/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp index 24c101b7137..640b2ead355 100644 --- a/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp +++ b/velox/common/memory/tests/CustomMemoryRegistrationTest.cpp @@ -38,7 +38,10 @@ CustomMemoryResource::ReclaimerFactory makeReclaimerFactory() { std::shared_ptr makeResource(const std::string& tag) { return std::make_shared( - tag, makeAllocator(), MemoryArbitrator::create({}), makeReclaimerFactory()); + tag, + makeAllocator(), + MemoryArbitrator::create({}), + makeReclaimerFactory()); } } // namespace diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index 1eb3da21544..ce8658e69f9 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -403,8 +403,7 @@ class QueryCtx : public std::enable_shared_from_this { /// Returns the custom root pool for the given resource tag, or nullptr if /// none is registered under that tag for this query. - std::shared_ptr customPool( - const std::string& tag) const; + std::shared_ptr customPool(const std::string& tag) const; /// Returns all custom root pools for this query, keyed by resource tag. const std::unordered_map>& From 76d6e9556f9cc6a8cff0e312e29eae7802a90b9d Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Thu, 21 May 2026 11:30:20 +0000 Subject: [PATCH 09/10] feat: Mirror task/node/operator pool hierarchy under custom memory resources Wires CustomMemoryResourceRegistry into Task/Driver/Operator so that for every custom root pool registered on a QueryCtx, Task builds a parallel 'task -> node -> operator' aggregate hierarchy. Operators look up their mirrored leaf pool via Operator::customPool(tag), letting them allocate from non-default memory backends (e.g. GPU, CXL) without disturbing the default pool hierarchy or its accounting. - Adds kCustomMemoryResourceRegistryKey for QueryCtx-scoped lookup. - Task: initCustomTaskPools, getOrAddCustomNodePool / getOrAddCustomJoinNodePool, addCustomOperatorPool, customNodePool, with ownership in customChildPools_. - DriverCtx::addCustomOperatorPools mirrors addOperatorPool per tag. - OperatorCtx caches the per-tag leaf pools and exposes customPool(). - New CustomMemoryHierarchyTest covers the mirrored hierarchy. - Docs: memory.rst and spilling.rst updated. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../memory/CustomMemoryResourceRegistry.h | 8 + velox/common/memory/tests/CMakeLists.txt | 1 + .../tests/CustomMemoryHierarchyTest.cpp | 244 ++++++++++++++++++ velox/docs/develop/memory.rst | 13 + velox/docs/develop/spilling.rst | 15 +- velox/exec/Driver.cpp | 19 ++ velox/exec/Driver.h | 10 + velox/exec/Operator.cpp | 9 +- velox/exec/Operator.h | 16 ++ velox/exec/Task.cpp | 125 +++++++++ velox/exec/Task.h | 54 ++++ 11 files changed, 505 insertions(+), 9 deletions(-) create mode 100644 velox/common/memory/tests/CustomMemoryHierarchyTest.cpp diff --git a/velox/common/memory/CustomMemoryResourceRegistry.h b/velox/common/memory/CustomMemoryResourceRegistry.h index 9b2f2500595..210ec8908c2 100644 --- a/velox/common/memory/CustomMemoryResourceRegistry.h +++ b/velox/common/memory/CustomMemoryResourceRegistry.h @@ -18,12 +18,20 @@ #include #include +#include #include "velox/common/ScopedRegistry.h" #include "velox/common/memory/CustomMemoryResource.h" namespace facebook::velox::memory { +/// Key under which a per-QueryCtx scoped CustomMemoryResourceRegistry is +/// stored on QueryCtx via QueryCtx::setRegistry / QueryCtx::registry. Tasks +/// use this key to look up resources when building the custom memory pool +/// hierarchy. +inline constexpr std::string_view kCustomMemoryResourceRegistryKey{ + "customMemoryResource"}; + /// Entry point for the CustomMemoryResource registry. Provides the /// process-global root and a factory for scoped registries. Callers /// register, look up, and clear entries directly on the Registry instance diff --git a/velox/common/memory/tests/CMakeLists.txt b/velox/common/memory/tests/CMakeLists.txt index 09c276ecf05..d797e39e5ed 100644 --- a/velox/common/memory/tests/CMakeLists.txt +++ b/velox/common/memory/tests/CMakeLists.txt @@ -21,6 +21,7 @@ add_executable( ByteStreamTest.cpp CompactDoubleListTest.cpp CustomMemoryEmulationTest.cpp + CustomMemoryHierarchyTest.cpp CustomMemoryPoolTest.cpp CustomMemoryRegistrationTest.cpp HashStringAllocatorTest.cpp diff --git a/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp b/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp new file mode 100644 index 00000000000..068333729cb --- /dev/null +++ b/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/CustomMemoryResourceRegistry.h" +#include "velox/common/memory/MallocAllocator.h" +#include "velox/common/memory/Memory.h" +#include "velox/common/memory/MemoryArbitrator.h" +#include "velox/core/QueryCtx.h" +#include "velox/exec/Driver.h" +#include "velox/exec/OperatorType.h" +#include "velox/exec/Task.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/vector/BaseVector.h" + +namespace facebook::velox::memory::test { +namespace { + +class CustomMemoryHierarchyTest : public testing::Test { + protected: + static void SetUpTestSuite() { + MemoryManager::testingSetInstance(MemoryManager::Options{}); + } + + void SetUp() override { + vectorPool_ = memoryManager()->addLeafPool("test-vec"); + } + + std::shared_ptr makeResource( + const std::string& tag, + int64_t capacity = 1L << 30) { + MemoryAllocator::Options options; + options.capacity = capacity; + return std::make_shared( + tag, + std::make_shared(options), + MemoryArbitrator::create({}), + []() { return MemoryReclaimer::create(0); }, + capacity); + } + + // Builds a QueryCtx with a custom root pool per tag, installs an isolated + // per-query CustomMemoryResourceRegistry on the QueryCtx so tests do not + // contend on the global registry, and inserts each backing resource into + // it so Task can resolve it at construction time. + std::shared_ptr buildQueryCtx( + const std::vector& tags, + const std::string& queryId) { + auto* manager = memoryManager(); + auto builder = core::QueryCtx::Builder().queryId(queryId); + std::vector> resources; + for (const auto& tag : tags) { + auto resource = makeResource(tag); + builder.customPool( + tag, + manager->addCustomRootPool( + fmt::format("{}.{}", queryId, tag), resource)); + resources.push_back(resource); + } + auto queryCtx = builder.build(); + auto registry = CustomMemoryResourceRegistry::createRegistry(nullptr); + queryCtx->setRegistry( + kCustomMemoryResourceRegistryKey, registry); + for (size_t i = 0; i < tags.size(); ++i) { + registry->insert(tags[i], resources[i]); + } + return queryCtx; + } + + // Returns a single-row Values plan. Vector data is allocated from + // 'vectorPool_', which outlives every task built off this plan because + // it is a fixture member destroyed after the test body. + core::PlanFragment makePlan() { + auto rowType = ROW({"a"}, {BIGINT()}); + auto rowVector = + BaseVector::create(rowType, /*size=*/1, vectorPool_.get()); + return exec::test::PlanBuilder().values({rowVector}).planFragment(); + } + + std::shared_ptr makeTask( + const std::string& taskId, + const std::shared_ptr& queryCtx) { + return exec::Task::create( + taskId, + makePlan(), + /*destination=*/0, + queryCtx, + exec::Task::ExecutionMode::kSerial, + exec::Consumer{}); + } + + // Returns the first child of 'pool' whose name matches 'name', or nullptr. + static MemoryPool* findChild(MemoryPool* pool, const std::string& name) { + MemoryPool* found = nullptr; + pool->visitChildren([&](MemoryPool* child) { + if (child->name() == name) { + found = child; + return false; + } + return true; + }); + return found; + } + + std::shared_ptr vectorPool_; +}; + +// Task construction creates 'task..' aggregate under each +// registered custom root. +TEST_F(CustomMemoryHierarchyTest, taskCreationMirrorsTaskPool) { + auto queryCtx = buildQueryCtx({"gpu"}, "q1"); + auto task = makeTask("t1", queryCtx); + + auto gpuRoot = queryCtx->customPool("gpu"); + ASSERT_NE(gpuRoot, nullptr); + auto* taskMirror = findChild(gpuRoot.get(), "task.t1.gpu"); + ASSERT_NE(taskMirror, nullptr); + EXPECT_EQ(taskMirror->kind(), MemoryPool::Kind::kAggregate); +} + +// Multiple tags produce independent mirror subtrees that do not bleed +// into one another. +TEST_F(CustomMemoryHierarchyTest, multipleTagsMirrorIndependently) { + auto queryCtx = buildQueryCtx({"gpu", "cxl"}, "q2"); + auto task = makeTask("t2", queryCtx); + + EXPECT_NE( + findChild(queryCtx->customPool("gpu").get(), "task.t2.gpu"), nullptr); + EXPECT_NE( + findChild(queryCtx->customPool("cxl").get(), "task.t2.cxl"), nullptr); + EXPECT_EQ( + findChild(queryCtx->customPool("gpu").get(), "task.t2.cxl"), nullptr); + EXPECT_EQ( + findChild(queryCtx->customPool("cxl").get(), "task.t2.gpu"), nullptr); +} + +// With no custom pools registered, Task creation runs the default path +// only — no exceptions and the default subtree is unaffected. +TEST_F(CustomMemoryHierarchyTest, noCustomPoolsRegisteredIsHarmless) { + auto queryCtx = buildQueryCtx({}, "q3"); + ASSERT_EQ(queryCtx->customPools().size(), 0); + EXPECT_NO_THROW(makeTask("t3", queryCtx)); +} + +// getOrAddCustomNodePool creates the aggregate under the task mirror +// and is idempotent for repeated calls. +TEST_F(CustomMemoryHierarchyTest, getOrAddCustomNodePoolIsIdempotent) { + auto queryCtx = buildQueryCtx({"gpu"}, "q4"); + auto task = makeTask("t4", queryCtx); + + auto* nodePool = task->getOrAddCustomNodePool("gpu", "n0"); + ASSERT_NE(nodePool, nullptr); + EXPECT_EQ(nodePool->name(), "node.n0.gpu"); + EXPECT_EQ(nodePool->kind(), MemoryPool::Kind::kAggregate); + EXPECT_EQ(nodePool->parent()->name(), "task.t4.gpu"); + + EXPECT_EQ(task->getOrAddCustomNodePool("gpu", "n0"), nodePool); +} + +// addCustomOperatorPool returns a fresh leaf parented to the node mirror +// for non-join operator types. +TEST_F(CustomMemoryHierarchyTest, addCustomOperatorPoolReturnsLeaf) { + auto queryCtx = buildQueryCtx({"gpu"}, "q5"); + auto task = makeTask("t5", queryCtx); + + auto* leaf = task->addCustomOperatorPool( + "gpu", + "n0", + exec::kUngroupedGroupId, + /*pipelineId=*/0, + /*driverId=*/0, + "Project"); + ASSERT_NE(leaf, nullptr); + EXPECT_EQ(leaf->name(), "op.n0.0.0.Project.gpu"); + EXPECT_EQ(leaf->kind(), MemoryPool::Kind::kLeaf); + EXPECT_EQ(leaf->parent()->name(), "node.n0.gpu"); +} + +// HashBuild / HashProbe operator types route through the join-keyed +// node pool, mirroring the default getOrAddJoinNodePool path. +TEST_F(CustomMemoryHierarchyTest, hashJoinOperatorUsesJoinNodeKey) { + auto queryCtx = buildQueryCtx({"gpu"}, "q6"); + auto task = makeTask("t6", queryCtx); + + auto* leaf = task->addCustomOperatorPool( + "gpu", + "n0", + /*splitGroupId=*/7, + /*pipelineId=*/0, + /*driverId=*/0, + std::string(exec::OperatorType::kHashBuild)); + ASSERT_NE(leaf, nullptr); + EXPECT_EQ(leaf->parent()->name(), "node.n0[7].gpu"); +} + +// customNodePool returns the cached pool after creation, and nullptr for +// unknown tags or node ids. +TEST_F(CustomMemoryHierarchyTest, customNodePoolAccessor) { + auto queryCtx = buildQueryCtx({"gpu"}, "q7"); + auto task = makeTask("t7", queryCtx); + + EXPECT_EQ(task->customNodePool("gpu", "n0"), nullptr); + auto* nodePool = task->getOrAddCustomNodePool("gpu", "n0"); + EXPECT_EQ(task->customNodePool("gpu", "n0"), nodePool); + EXPECT_EQ(task->customNodePool("missing-tag", "n0"), nullptr); + EXPECT_EQ(task->customNodePool("gpu", "missing-node"), nullptr); +} + +// Looking up a tag with no registered resource throws clearly during +// task creation. An isolated empty per-query registry is installed so the +// lookup never falls back to the process-global registry (which other +// tests in this process may have populated). +TEST_F(CustomMemoryHierarchyTest, taskCreationFailsWhenResourceMissing) { + auto* manager = memoryManager(); + auto resource = makeResource("gpu"); + auto pool = manager->addCustomRootPool("q-missing.gpu", resource); + auto queryCtx = core::QueryCtx::Builder() + .customPool("gpu", std::move(pool)) + .queryId("q-missing") + .build(); + queryCtx->setRegistry( + kCustomMemoryResourceRegistryKey, + CustomMemoryResourceRegistry::createRegistry(nullptr)); + EXPECT_THROW(makeTask("t-missing", queryCtx), VeloxRuntimeError); +} + +} // namespace +} // namespace facebook::velox::memory::test diff --git a/velox/docs/develop/memory.rst b/velox/docs/develop/memory.rst index 801fc21b0e4..94437cb90c9 100644 --- a/velox/docs/develop/memory.rst +++ b/velox/docs/develop/memory.rst @@ -953,6 +953,19 @@ The root pool is exposed on *QueryCtx* keyed by tag: std::shared_ptr QueryCtx::customPool( const std::string& tag) const; +Per-Query Pool Hierarchy +^^^^^^^^^^^^^^^^^^^^^^^^ + +For every custom root pool registered on *QueryCtx* via +*Builder::customPool*, *Task* builds a parallel ``task → node → operator`` +aggregate/leaf subtree beneath it that mirrors the default hierarchy. +Aggregate children under a custom root are created at the same moment as +their default counterparts. Reclaimers for these aggregates come +from each resource's ``reclaimerFactory`` via +*CustomMemoryResource::newReclaimer*, so capacity decisions and reclaim +on a custom subtree are governed end-to-end by the resource's own +arbitrator and reclaimer — separate from the default DRAM tier. + Server OOM Prevention --------------------- diff --git a/velox/docs/develop/spilling.rst b/velox/docs/develop/spilling.rst index 4b637fffba7..5723e8be852 100644 --- a/velox/docs/develop/spilling.rst +++ b/velox/docs/develop/spilling.rst @@ -574,14 +574,13 @@ the reclaim policy: the CXL-backed *HashAggregation* selects a spill target — the set of partitions with the most data — and processes the corresponding rows in its DRAM row container. Unlike the default operator, it copies each - row into the CXL custom pool obtained via *queryCtx->customPool("cxl")* - and pointer-swizzles the corresponding hash-table bucket entries to the - new CXL addresses, rather than clearing them. The swizzle has the same - cost shape as the default operator's hash-table entry removal — a - *ProbeState* walk per row to find each bucket slot — but with greater - benefit: because CXL memory is coherent with the CPU address space, - probe and finalize logic continue to read the relocated entries - directly with no restore step. + row into the CXL custom pool and pointer-swizzles the corresponding + hash-table bucket entries to the new CXL addresses, rather than clearing + them. The swizzle has the same cost shape as the default operator's + hash-table entry removal — a *ProbeState* walk per row to find each bucket + slot — but with greater benefit: because CXL memory is coherent with the + CPU address space, probe and finalize logic continue to read the relocated + entries directly with no restore step. #. **CXL pool fills.** Subsequent reclaim triggers continue moving partitions from DRAM into CXL until the CXL custom pool's arbitrator diff --git a/velox/exec/Driver.cpp b/velox/exec/Driver.cpp index 1514f9a8516..92c5ea76e39 100644 --- a/velox/exec/Driver.cpp +++ b/velox/exec/Driver.cpp @@ -122,6 +122,25 @@ velox::memory::MemoryPool* DriverCtx::addOperatorPool( planNodeId, splitGroupId, pipelineId, driverId, operatorType); } +std::unordered_map +DriverCtx::addCustomOperatorPools( + const core::PlanNodeId& planNodeId, + const std::string& operatorType) { + std::unordered_map result; + const auto& customRoots = task->queryCtx()->customPools(); + if (customRoots.empty()) { + return result; + } + result.reserve(customRoots.size()); + for (const auto& [tag, _] : customRoots) { + result.emplace( + tag, + task->addCustomOperatorPool( + tag, planNodeId, splitGroupId, pipelineId, driverId, operatorType)); + } + return result; +} + namespace { bool isHashJoinSpillOperator(std::string_view operatorType) { return operatorType == OperatorType::kHashBuild || diff --git a/velox/exec/Driver.h b/velox/exec/Driver.h index 2407e7240cc..e791d6ee7d7 100644 --- a/velox/exec/Driver.h +++ b/velox/exec/Driver.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -264,6 +265,15 @@ struct DriverCtx { const core::PlanNodeId& planNodeId, const std::string& operatorType); + /// Creates one leaf operator pool per registered custom root pool on the + /// owning task's QueryCtx, mirroring 'addOperatorPool' but under each + /// custom root. Returns a map keyed by resource tag. Empty when no custom + /// pools are registered. + std::unordered_map + addCustomOperatorPools( + const core::PlanNodeId& planNodeId, + const std::string& operatorType); + /// Builds the spill config for the operator with specified 'operatorId' and /// 'operatorType'. std::optional makeSpillConfig( diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index 9e6e6a13fd3..aa804287233 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -37,7 +37,14 @@ OperatorCtx::OperatorCtx( planNodeId_(planNodeId), operatorId_(operatorId), operatorType_(operatorType), - pool_(driverCtx_->addOperatorPool(planNodeId, operatorType_)) {} + pool_(driverCtx_->addOperatorPool(planNodeId, operatorType_)), + customPools_( + driverCtx_->addCustomOperatorPools(planNodeId, operatorType_)) {} + +memory::MemoryPool* OperatorCtx::customPool(std::string_view tag) const { + auto it = customPools_.find(std::string(tag)); + return it == customPools_.end() ? nullptr : it->second; +} core::ExecCtx* OperatorCtx::execCtx() const { if (!execCtx_) { diff --git a/velox/exec/Operator.h b/velox/exec/Operator.h index 30f109b48a8..a93dac3d978 100644 --- a/velox/exec/Operator.h +++ b/velox/exec/Operator.h @@ -17,6 +17,7 @@ #include #include +#include #include "velox/core/PlanNode.h" #include "velox/core/QueryCtx.h" #include "velox/exec/Driver.h" @@ -65,6 +66,10 @@ class OperatorCtx { return pool_; } + /// Returns the leaf operator pool under the custom root registered with + /// 'tag', or nullptr if no such custom root is registered on this query. + velox::memory::MemoryPool* customPool(std::string_view tag) const; + const core::PlanNodeId& planNodeId() const { return planNodeId_; } @@ -102,6 +107,11 @@ class OperatorCtx { const std::string operatorType_; velox::memory::MemoryPool* const pool_; + // Per-resource-tag leaf pools mirroring 'pool_' under each registered + // custom root pool. Empty when no custom pools are registered. + const std::unordered_map + customPools_; + // These members are created on demand. mutable std::unique_ptr execCtx_; }; @@ -372,6 +382,12 @@ class Operator : public BaseRuntimeStatWriter { return operatorCtx_->pool(); } + /// Returns this operator's leaf pool under the custom root registered with + /// 'tag', or nullptr if no such custom root is registered. + velox::memory::MemoryPool* customPool(std::string_view tag) const { + return operatorCtx_->customPool(tag); + } + /// Returns true if the operator is reclaimable. Currently, we only support /// to reclaim memory from a spillable operator. FOLLY_ALWAYS_INLINE virtual bool canReclaim() const { diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index 38efc1c4aa3..270c52f9c31 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -23,6 +23,8 @@ #include "velox/common/base/Counters.h" #include "velox/common/base/StatsReporter.h" #include "velox/common/file/FileSystems.h" +#include "velox/common/memory/CustomMemoryResource.h" +#include "velox/common/memory/CustomMemoryResourceRegistry.h" #include "velox/common/testutil/TestValue.h" #include "velox/common/time/Timer.h" #include "velox/exec/Exchange.h" @@ -212,6 +214,18 @@ bool isHashJoinOperator(const std::string& operatorType) { (operatorType == OperatorType::kHashProbe); } +// Returns the CustomMemoryResourceRegistry visible to 'queryCtx': the +// per-query scoped registry if one has been installed under +// 'kCustomMemoryResourceRegistryKey', otherwise the process-global +// registry. Mirrors connector::ConnectorRegistry's per-query lookup. +memory::CustomMemoryResourceRegistry::Registry& customMemoryResourceRegistryFor( + const core::QueryCtx& queryCtx) { + auto perQuery = + queryCtx.registry( + memory::kCustomMemoryResourceRegistryKey); + return perQuery ? *perQuery : memory::CustomMemoryResourceRegistry::global(); +} + class QueueSplitsStore : public SplitsStore { public: using SplitsStore::SplitsStore; @@ -487,6 +501,9 @@ Task::~Task() { CLEAR(exchangeClients_.clear()); CLEAR(exception_ = nullptr); CLEAR(nodePools_.clear()); + CLEAR(customNodePools_.clear()); + CLEAR(customTaskPools_.clear()); + CLEAR(customChildPools_.clear()); CLEAR(childPools_.clear()); CLEAR(planFragment_ = core::PlanFragment()); CLEAR(pool_.reset()); @@ -711,6 +728,25 @@ void Task::initTaskPool() { VELOX_CHECK_NULL(pool_); pool_ = queryCtx_->pool()->addAggregateChild( fmt::format("task.{}", taskId_.c_str()), createTaskReclaimer()); + initCustomTaskPools(); +} + +void Task::initCustomTaskPools() { + VELOX_CHECK_NOT_NULL(pool_); + const auto& customRoots = queryCtx_->customPools(); + if (customRoots.empty()) { + return; + } + auto& registry = customMemoryResourceRegistryFor(*queryCtx_); + for (const auto& [tag, root] : customRoots) { + auto resource = registry.find(tag); + VELOX_CHECK_NOT_NULL( + resource, "No CustomMemoryResource registered for tag: {}", tag); + customChildPools_.push_back(root->addAggregateChild( + fmt::format("task.{}.{}", taskId_.c_str(), tag), + resource->newReclaimer())); + customTaskPools_[tag] = customChildPools_.back().get(); + } } velox::memory::MemoryPool* Task::getOrAddNodePool( @@ -725,6 +761,31 @@ velox::memory::MemoryPool* Task::getOrAddNodePool( }))); auto* nodePool = childPools_.back().get(); nodePools_[planNodeId] = nodePool; + for (const auto& [tag, _] : queryCtx_->customPools()) { + getOrAddCustomNodePool(tag, planNodeId); + } + return nodePool; +} + +memory::MemoryPool* Task::getOrAddCustomNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId) { + auto& nodeMap = customNodePools_[tag]; + if (auto it = nodeMap.find(planNodeId); it != nodeMap.end()) { + return it->second; + } + auto taskIt = customTaskPools_.find(tag); + VELOX_CHECK( + taskIt != customTaskPools_.end(), + "Custom task pool missing for tag: {}", + tag); + auto resource = customMemoryResourceRegistryFor(*queryCtx_).find(tag); + VELOX_CHECK_NOT_NULL( + resource, "No CustomMemoryResource registered for tag: {}", tag); + customChildPools_.push_back(taskIt->second->addAggregateChild( + fmt::format("node.{}.{}", planNodeId, tag), resource->newReclaimer())); + auto* nodePool = customChildPools_.back().get(); + nodeMap[planNodeId] = nodePool; return nodePool; } @@ -747,9 +808,73 @@ memory::MemoryPool* Task::getOrAddJoinNodePool( }))); auto* nodePool = childPools_.back().get(); nodePools_[nodeId] = nodePool; + for (const auto& [tag, _] : queryCtx_->customPools()) { + getOrAddCustomJoinNodePool(tag, planNodeId, splitGroupId); + } + return nodePool; +} + +memory::MemoryPool* Task::getOrAddCustomJoinNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId, + uint32_t splitGroupId) { + const std::string nodeId = splitGroupId == kUngroupedGroupId + ? planNodeId + : fmt::format("{}[{}]", planNodeId, splitGroupId); + auto& nodeMap = customNodePools_[tag]; + if (auto it = nodeMap.find(nodeId); it != nodeMap.end()) { + return it->second; + } + auto taskIt = customTaskPools_.find(tag); + VELOX_CHECK( + taskIt != customTaskPools_.end(), + "Custom task pool missing for tag: {}", + tag); + auto resource = customMemoryResourceRegistryFor(*queryCtx_).find(tag); + VELOX_CHECK_NOT_NULL( + resource, "No CustomMemoryResource registered for tag: {}", tag); + customChildPools_.push_back(taskIt->second->addAggregateChild( + fmt::format("node.{}.{}", nodeId, tag), resource->newReclaimer())); + auto* nodePool = customChildPools_.back().get(); + nodeMap[nodeId] = nodePool; return nodePool; } +memory::MemoryPool* Task::addCustomOperatorPool( + const std::string& tag, + const core::PlanNodeId& planNodeId, + uint32_t splitGroupId, + int pipelineId, + uint32_t driverId, + const std::string& operatorType) { + memory::MemoryPool* nodePool; + if (isHashJoinOperator(operatorType)) { + nodePool = getOrAddCustomJoinNodePool(tag, planNodeId, splitGroupId); + } else { + nodePool = getOrAddCustomNodePool(tag, planNodeId); + } + customChildPools_.push_back(nodePool->addLeafChild( + fmt::format( + "op.{}.{}.{}.{}.{}", + planNodeId, + pipelineId, + driverId, + operatorType, + tag))); + return customChildPools_.back().get(); +} + +memory::MemoryPool* Task::customNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId) const { + auto tagIt = customNodePools_.find(tag); + if (tagIt == customNodePools_.end()) { + return nullptr; + } + auto nodeIt = tagIt->second.find(planNodeId); + return nodeIt == tagIt->second.end() ? nullptr : nodeIt->second; +} + std::unique_ptr Task::createNodeReclaimer( const std::function()>& reclaimerFactory) const { diff --git a/velox/exec/Task.h b/velox/exec/Task.h index 2b483b806a9..2c7ba1c7132 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -407,6 +407,38 @@ class Task : public std::enable_shared_from_this { uint32_t driverId, const std::string& operatorType); + /// Returns the node-level aggregate pool under the custom root for 'tag' + /// and 'planNodeId', creating it if needed. Throws if 'tag' has no + /// registered custom root on this query's QueryCtx. + memory::MemoryPool* getOrAddCustomNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId); + + /// Hash-join variant of getOrAddCustomNodePool. Mirrors + /// getOrAddJoinNodePool: keys the node pool by '[]' + /// for grouped execution. + memory::MemoryPool* getOrAddCustomJoinNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId, + uint32_t splitGroupId); + + /// Returns a new leaf operator pool under the custom root for 'tag', + /// mirroring addOperatorPool. Selects the join or non-join node parent + /// based on 'operatorType'. + memory::MemoryPool* addCustomOperatorPool( + const std::string& tag, + const core::PlanNodeId& planNodeId, + uint32_t splitGroupId, + int pipelineId, + uint32_t driverId, + const std::string& operatorType); + + /// Read-only accessor for the cached custom node pool under 'tag' and + /// 'planNodeId', or nullptr if absent. + memory::MemoryPool* customNodePool( + const std::string& tag, + const core::PlanNodeId& planNodeId) const; + /// Creates new instance of MemoryPool with aggregate kind for the connector /// use, stores it in the task to ensure lifetime and returns a raw pointer. /// Not thread safe, e.g. must be called from the Operator's constructor. @@ -891,6 +923,11 @@ class Task : public std::enable_shared_from_this { // Invoked to initialize the memory pool for this task on creation. void initTaskPool(); + // Creates the per-tag 'task..' aggregate child under every + // custom root pool registered on the QueryCtx. Called from initTaskPool + // after the default task pool is created. + void initCustomTaskPools(); + // Creates a scaled scan controller for a given table scan node. void addScaledScanControllerLocked( uint32_t splitGroupId, @@ -1214,6 +1251,23 @@ class Task : public std::enable_shared_from_this { // NOTE: 'childPools_' holds the ownerships of node memory pools. std::unordered_map nodePools_; + // Aggregate child of each registered custom root pool. Keyed by resource + // tag. Lifetime is held in 'customChildPools_'. + std::unordered_map customTaskPools_; + + // Node-level aggregate pools under each custom root, keyed first by + // resource tag, then by plan node id. Lifetime is held in + // 'customChildPools_'. + std::unordered_map< + std::string, + std::unordered_map> + customNodePools_; + + // Owns the shared_ptrs to all custom task/node/operator pools mirrored + // under registered custom roots. Kept separate from 'childPools_' so the + // default hierarchy is unchanged. + std::vector> customChildPools_; + // Set to true by OutputBufferManager when all output is // acknowledged. If this happens before Drivers are at end, the last // Driver to finish will set state_ to kFinished. If Drivers have From c9260b4bafc63d45646a33713c783f1889dd97c9 Mon Sep 17 00:00:00 2001 From: sungwoo-XCENA Date: Tue, 2 Jun 2026 08:38:34 +0000 Subject: [PATCH 10/10] fix: Memory leak error in CustomMemoryResource test --- .../memory/tests/CustomMemoryHierarchyTest.cpp | 16 +++++++++++++++- velox/core/QueryCtx.h | 7 ++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp b/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp index 068333729cb..db725da1e9a 100644 --- a/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp +++ b/velox/common/memory/tests/CustomMemoryHierarchyTest.cpp @@ -42,6 +42,13 @@ class CustomMemoryHierarchyTest : public testing::Test { vectorPool_ = memoryManager()->addLeafPool("test-vec"); } + void TearDown() override { + for (const auto& task : tasks_) { + task->requestCancel(); + } + tasks_.clear(); + } + std::shared_ptr makeResource( const std::string& tag, int64_t capacity = 1L << 30) { @@ -96,13 +103,16 @@ class CustomMemoryHierarchyTest : public testing::Test { std::shared_ptr makeTask( const std::string& taskId, const std::shared_ptr& queryCtx) { - return exec::Task::create( + auto task = exec::Task::create( taskId, makePlan(), /*destination=*/0, queryCtx, exec::Task::ExecutionMode::kSerial, exec::Consumer{}); + // Track so TearDown can cancel it and release the Task<->Driver cycle. + tasks_.push_back(task); + return task; } // Returns the first child of 'pool' whose name matches 'name', or nullptr. @@ -119,6 +129,10 @@ class CustomMemoryHierarchyTest : public testing::Test { } std::shared_ptr vectorPool_; + + // Tasks created via makeTask, cancelled in TearDown to break the + // Task<->Driver cycle and release their pools. + std::vector> tasks_; }; // Task construction creates 'task..' aggregate under each diff --git a/velox/core/QueryCtx.h b/velox/core/QueryCtx.h index ce8658e69f9..12d8f5daa83 100644 --- a/velox/core/QueryCtx.h +++ b/velox/core/QueryCtx.h @@ -504,9 +504,6 @@ class QueryCtx : public std::enable_shared_from_this { connectorSessionProperties_; std::shared_ptr pool_; - std::unordered_map> - customPools_; - QueryConfig queryConfig_; std::atomic numSpilledBytes_{0}; std::atomic numTracedBytes_{0}; @@ -535,6 +532,10 @@ class QueryCtx : public std::enable_shared_from_this { // Per-query registry overrides keyed by subsystem name. folly::Synchronized> registries_; + + // Custom root memory pools keyed by tag. + std::unordered_map> + customPools_; }; // Represents the state of one thread of query execution.