From 50a724b0e5cb6cf13d1eca9cde048b31f7fa3929 Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Tue, 4 Mar 2025 18:01:55 +0100 Subject: [PATCH 1/7] [ntuple] Add `RNTupleModel::GetFieldNames` --- tree/ntuple/v7/inc/ROOT/RNTupleModel.hxx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tree/ntuple/v7/inc/ROOT/RNTupleModel.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleModel.hxx index 2fab4fca0354a..5488f912a0ee4 100644 --- a/tree/ntuple/v7/inc/ROOT/RNTupleModel.hxx +++ b/tree/ntuple/v7/inc/ROOT/RNTupleModel.hxx @@ -337,6 +337,9 @@ public: const std::string &GetDescription() const { return fDescription; } void SetDescription(std::string_view description); + /// Get the names of the fields currently present in the model, including projected fields. Registered subfields + /// are not included; use GetRegisteredSubfieldNames() for this. + const std::unordered_set &GetFieldNames() const { return fFieldNames; } /// Get the (qualified) names of subfields that have been registered to be included in entries from this model. const std::unordered_set &GetRegisteredSubfieldNames() const { return fRegisteredSubfields; } From 95b817fb678f47aa8c754b697e580e5a224254ae Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Thu, 6 Mar 2025 12:06:01 +0100 Subject: [PATCH 2/7] [ntuple] Split primary and auxiliary models To reflect the interface changes introduced in rev d5b4931, instead of a single list of models (one per primary/auxiliary ntuple), a distinction is made between the models of the primary and auxiliary ntuples. As before, they are combined internally into a single model used by the processor. 
--- tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx | 66 ++++-- tree/ntuple/v7/src/RNTupleProcessor.cxx | 208 ++++++++---------- tree/ntuple/v7/test/ntuple_processor_join.cxx | 21 +- 3 files changed, 147 insertions(+), 148 deletions(-) diff --git a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx index 978894608f223..65ff82d61dc47 100644 --- a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx +++ b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx @@ -381,14 +381,18 @@ public: /// \param[in] joinFields The names of the fields on which to join, in case the specified RNTuples are unaligned. /// The join is made based on the combined join field values, and therefore each field has to be present in each /// specified RNTuple. If an empty list is provided, it is assumed that the specified ntuple are fully aligned. - /// \param[in] models A list of models for the RNTuples. This list must either contain a model for the primary - /// RNTuple and each auxiliary RNTuple (following the specification order), or be empty. When the list is empty, the - /// default model (i.e. containing all fields) will be used for each RNTuple. + /// \param[in] primaryModel An RNTupleModel specifying which fields from the primary RNTuple can be read by the + /// processor. If no model is provided, one will be created based on the descriptor of the primary RNTuple. + /// \param[in] auxModels A list of RNTupleModels specifying which fields from the corresponding auxiliary RNTuple + /// (according to the order of `auxNTuples`) can be read by the processor. If this vector is empty, the models will + /// be created based on the descriptors of their corresponding RNTuples. This also applies to individual auxiliary + /// RNTuples for which the provided model is a `nullptr`. /// /// \return A pointer to the newly created RNTupleProcessor. 
static std::unique_ptr - CreateJoin(RNTupleOpenSpec primaryNTuple, std::vector auxNTuples, - const std::vector &joinFields, std::vector> models = {}); + CreateJoin(const RNTupleOpenSpec &primaryNTuple, const std::vector &auxNTuples, + const std::vector &joinFields, std::unique_ptr primaryModel = nullptr, + std::vector> auxModels = {}); ///////////////////////////////////////////////////////////////////////////// /// \brief Create an RNTupleProcessor for a *join* (i.e., a horizontal combination) of RNTuples. @@ -403,16 +407,22 @@ public: /// specified RNTuple. If an empty list is provided, it is assumed that the specified RNTuple are fully aligned. /// \param[in] processorName The name to give to the processor. Use /// CreateJoin(const RNTupleOpenSpec &, const std::vector &, const std::vector &, - /// std::vector>) to automatically use the name of the input RNTuple instead. - /// \param[in] models A list of models for the RNTuples. This list must either contain a model for the primary - /// RNTuple and each auxiliary RNTuple (following the specification order), or be empty. When the list is empty, the - /// default model (i.e. containing all fields) will be used for each RNTuple. + /// std::unique_ptr, std::vector>) to automatically use the name of the + /// input RNTuple instead. + /// \param[in] primaryModel An RNTupleModel specifying which fields from the primary RNTuple + /// can be read by the processor. If no model is provided, one will be created based on the descriptor of the primary + /// RNTuple. + /// \param[in] auxModels A list of RNTupleModels specifying which fields from the corresponding auxiliary + /// RNTuple (according to the order of `auxNTuples`) can be read by the processor. If this vector is empty, the + /// models will be created based on the descriptors of their corresponding RNTuples. This also applies to individual + /// auxiliary RNTuples for which the provided model is a `nullptr`. 
 /// /// \return A pointer to the newly created RNTupleProcessor. static std::unique_ptr - CreateJoin(RNTupleOpenSpec primaryNTuple, std::vector auxNTuples, + CreateJoin(const RNTupleOpenSpec &primaryNTuple, const std::vector &auxNTuples, const std::vector &joinFields, std::string_view processorName, - std::vector> models = {}); + std::unique_ptr primaryModel = nullptr, + std::vector> auxModels = {}); }; // clang-format off @@ -540,6 +550,17 @@ private: /// \brief Get the total number of entries in this processor. ROOT::NTupleSize_t GetNEntries() final { return fNEntries; } + ///////////////////////////////////////////////////////////////////////////// + /// \brief Set fModel by combining the primary and auxiliary models. + /// + /// \param[in] primaryModel The model of the primary RNTuple. + /// \param[in] auxModels Models of the auxiliary RNTuples. + /// + /// To prevent field name clashes when one or more models have fields with duplicate names, fields from each + /// auxiliary model are stored as an anonymous record, and subsequently registered as subfields in the join model. + /// This way, they can be accessed from the processor's entry as `auxNTupleName.fieldName`. + void SetModel(std::unique_ptr primaryModel, std::vector> auxModels); + ///////////////////////////////////////////////////////////////////////////// /// \brief Construct a new RNTupleJoinProcessor. /// @@ -550,24 +571,25 @@ private: /// specified RNTuple. If an empty list is provided, it is assumed that the RNTuples are fully aligned. /// \param[in] processorName Name of the processor. Unless specified otherwise in RNTupleProcessor::CreateJoin, this /// is the name of the main RNTuple. - /// \param[in] models The models that specify which fields should be read by the processor, ordered according to - /// {mainNTuple, auxNTuple[0], ...}. The pointer returned by RNTupleModel::MakeField can be used to access a field's - /// value during the processor iteration. 
When an empty list is passed, the models are created from the descriptor of - /// each RNTuple specified in `mainNTuple` and `auxNTuple`. - RNTupleJoinProcessor(RNTupleOpenSpec mainNTuple, std::vector auxNTuples, + /// \param[in] primaryModel An RNTupleModel specifying which fields from the primary RNTuple can be read by the + /// processor. If no model is provided, one will be created based on the descriptor of the primary RNTuple. + /// \param[in] auxModels A list of RNTupleModels specifying which fields from the corresponding auxiliary RNTuple + /// (according to the order of `auxNTuples`) can be read by the processor. If this vector is empty, the models will + /// be created based on the descriptors of their corresponding RNTuples. This also applies to individual auxiliary + /// RNTuples for which the provided model is a `nullptr`. + RNTupleJoinProcessor(const RNTupleOpenSpec &mainNTuple, const std::vector &auxNTuples, const std::vector &joinFields, std::string_view processorName, - std::vector> models = {}); + std::unique_ptr primaryModel = nullptr, + std::vector> auxModels = {}); ///////////////////////////////////////////////////////////////////////////// /// \brief Add an auxiliary RNTuple to the processor. /// /// \param[in] auxNTuple The source specification (name and storage location) of the auxiliary RNTuple. /// \param[in] joinFields The names of the fields used in the join. - /// \param[in] model The model that specifies which fields should be read by the processor. The pointer returned by - /// RNTupleModel::MakeField can be used to access a field's value during the processor iteration. When no model is - /// specified, it is created from the RNTuple's descriptor. - void AddAuxiliary(const RNTupleOpenSpec &auxNTuple, const std::vector &joinFields, - std::unique_ptr model = nullptr); + /// \param[in] ntupleIdx The index of the ntuple according to fNTuples. 
+ void + AddAuxiliary(const RNTupleOpenSpec &auxNTuple, const std::vector &joinFields, std::size_t ntupleIdx); ///////////////////////////////////////////////////////////////////////////// /// \brief Connect all fields, once the primary and all auxiliary RNTuples have been added. diff --git a/tree/ntuple/v7/src/RNTupleProcessor.cxx b/tree/ntuple/v7/src/RNTupleProcessor.cxx index 63cd9bb147998..cc2a7cc2c323c 100644 --- a/tree/ntuple/v7/src/RNTupleProcessor.cxx +++ b/tree/ntuple/v7/src/RNTupleProcessor.cxx @@ -109,23 +109,23 @@ ROOT::Experimental::RNTupleProcessor::CreateChain(std::vector -ROOT::Experimental::RNTupleProcessor::CreateJoin(RNTupleOpenSpec primaryNTuple, std::vector auxNTuples, +ROOT::Experimental::RNTupleProcessor::CreateJoin(const RNTupleOpenSpec &primaryNTuple, + const std::vector &auxNTuples, const std::vector &joinFields, - std::vector> models) + std::unique_ptr primaryModel, + std::vector> auxModels) { - auto processorName = primaryNTuple.fNTupleName; - return CreateJoin(std::move(primaryNTuple), std::move(auxNTuples), joinFields, processorName, std::move(models)); + return CreateJoin(primaryNTuple, auxNTuples, joinFields, primaryNTuple.fNTupleName, std::move(primaryModel), + std::move(auxModels)); } -std::unique_ptr -ROOT::Experimental::RNTupleProcessor::CreateJoin(RNTupleOpenSpec primaryNTuple, std::vector auxNTuples, - const std::vector &joinFields, - std::string_view processorName, - std::vector> models) +std::unique_ptr ROOT::Experimental::RNTupleProcessor::CreateJoin( + const RNTupleOpenSpec &primaryNTuple, const std::vector &auxNTuples, + const std::vector &joinFields, std::string_view processorName, + std::unique_ptr primaryModel, std::vector> auxModels) { - if (!models.empty() && models.size() != (auxNTuples.size() + 1)) { - throw RException(R__FAIL("number of provided models must match number of specified ntuples")); - } + if (!auxModels.empty() && auxModels.size() != auxNTuples.size()) + throw RException(R__FAIL("number of auxiliary 
models and auxiliary RNTuples does not match")); if (joinFields.size() > 4) { throw RException(R__FAIL("a maximum of four join fields is allowed")); @@ -146,14 +146,8 @@ ROOT::Experimental::RNTupleProcessor::CreateJoin(RNTupleOpenSpec primaryNTuple, } } - std::unique_ptr processor; - if (!models.empty()) { - processor = std::unique_ptr(new RNTupleJoinProcessor( - std::move(primaryNTuple), std::move(auxNTuples), joinFields, processorName, std::move(models))); - } else { - processor = std::unique_ptr( - new RNTupleJoinProcessor(std::move(primaryNTuple), std::move(auxNTuples), joinFields, processorName)); - } + std::unique_ptr processor = std::unique_ptr(new RNTupleJoinProcessor( + primaryNTuple, auxNTuples, joinFields, processorName, std::move(primaryModel), std::move(auxModels))); processor->SetJoinFieldTokens(joinFields); processor->ConnectFields(); @@ -164,6 +158,7 @@ ROOT::Experimental::RNTupleProcessor::CreateJoin(RNTupleOpenSpec primaryNTuple, void ROOT::Experimental::RNTupleProcessor::ConnectField(RFieldContext &fieldContext, Internal::RPageSource &pageSource, REntry &entry) { + pageSource.Attach(); auto desc = pageSource.GetSharedDescriptorGuard(); const auto fieldId = desc->FindFieldId(fieldContext.GetProtoField().GetFieldName()); @@ -344,14 +339,17 @@ ROOT::NTupleSize_t ROOT::Experimental::RNTupleChainProcessor::LoadEntry(ROOT::NT //------------------------------------------------------------------------------ -ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(RNTupleOpenSpec mainNTuple, - std::vector auxNTuples, +ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(const RNTupleOpenSpec &mainNTuple, + const std::vector &auxNTuples, const std::vector &joinFields, std::string_view processorName, - std::vector> models) + std::unique_ptr primaryModel, + std::vector> auxModels) : RNTupleProcessor(processorName, nullptr) { fNTuples.emplace_back(mainNTuple); + fNTuples.insert(fNTuples.end(), auxNTuples.begin(), auxNTuples.end()); + 
fPageSource = mainNTuple.CreatePageSource(); fPageSource->Attach(); @@ -361,10 +359,23 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(RNTupleOpenSpec m fNEntries = fPageSource->GetNEntries(); - if (models.empty()) - fModel = fPageSource->GetSharedDescriptorGuard()->CreateModel(); - else - fModel = models[0]->Clone(); + for (const auto &auxNTuple : auxNTuples) { + fAuxiliaryPageSources.emplace_back(auxNTuple.CreatePageSource()); + } + + if (!primaryModel) + primaryModel = fPageSource->GetSharedDescriptorGuard()->CreateModel(); + if (auxModels.empty()) { + auxModels.resize(fAuxiliaryPageSources.size()); + } + for (unsigned i = 0; i < fAuxiliaryPageSources.size(); ++i) { + if (!auxModels[i]) { + fAuxiliaryPageSources[i]->Attach(); + auxModels[i] = fAuxiliaryPageSources[i]->GetSharedDescriptorGuard()->CreateModel(); + } + } + + SetModel(std::move(primaryModel), std::move(auxModels)); fModel->Freeze(); fEntry = fModel->CreateEntry(); @@ -373,123 +384,92 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(RNTupleOpenSpec m auto &field = value.GetField(); const auto &fieldName = field.GetQualifiedFieldName(); + auto isAuxParent = std::find_if(auxNTuples.cbegin(), auxNTuples.cend(), [&fieldName](const RNTupleOpenSpec &n) { + return fieldName.substr(0, n.fNTupleName.size()) == n.fNTupleName; + }); + if (isAuxParent != auxNTuples.end()) + continue; + // If the model provided by the user has a default entry, use the value pointers from the default entry of the // model that was passed to this constructor. This way, the pointers returned by RNTupleModel::MakeField can be // used in the processor loop to access the corresponding field values. 
- if (!models.empty() && !models[0]->IsBare()) { - auto valuePtr = models[0]->GetDefaultEntry().GetPtr(fieldName); + if (!fModel->IsBare()) { + auto valuePtr = fModel->GetDefaultEntry().GetPtr(fieldName); fEntry->BindValue(fieldName, valuePtr); } - const auto &[fieldContext, _] = - fFieldContexts.try_emplace(fieldName, field.Clone(fieldName), fEntry->GetToken(fieldName)); - ConnectField(fieldContext->second, *fPageSource, *fEntry); + fFieldContexts.try_emplace(fieldName, field.Clone(fieldName), fEntry->GetToken(fieldName)); } for (unsigned i = 0; i < auxNTuples.size(); ++i) { - if (models.empty()) { - AddAuxiliary(auxNTuples[i], joinFields); - } else { - // The size of `models` is checked in `CreateJoin`; at this point we can safely assume that `models.size() == - // auxNTuples.size() + 1`. - AddAuxiliary(auxNTuples[i], joinFields, std::move(models[i + 1])); - } + AddAuxiliary(auxNTuples[i], joinFields, i + 1 /* ntupleIdx */); } } -void ROOT::Experimental::RNTupleJoinProcessor::AddAuxiliary(const RNTupleOpenSpec &auxNTuple, - const std::vector &joinFields, - std::unique_ptr model) +void ROOT::Experimental::RNTupleJoinProcessor::SetModel(std::unique_ptr primaryModel, + std::vector> auxModels) { - assert(fNEntriesProcessed == 0 && "cannot add auxiliary ntuples after processing has started"); - - fNTuples.emplace_back(auxNTuple); - - auto pageSource = auxNTuple.CreatePageSource(); - pageSource->Attach(); - - if (pageSource->GetNEntries() == 0) { - throw RException(R__FAIL("provided RNTuple is empty")); - } - - if (!model) - model = pageSource->GetSharedDescriptorGuard()->CreateModel(); - - model->Freeze(); - auto entry = model->CreateBareEntry(); - - // Append the auxiliary fields to the join model + fModel = std::move(primaryModel); fModel->Unfreeze(); - // The fields of the auxiliary ntuple are contained in an anonymous record field and subsequently registered as - // subfields to the join model. 
This way they can be accessed through the processor as `auxNTupleName.fieldName`, - // which is necessary in case there are duplicate field names between the main ntuple and (any of the) auxiliary - // ntuples. - std::vector> auxFields; - auxFields.reserve(entry->fValues.size()); - for (const auto &val : *entry) { - auto &field = val.GetField(); + // Create an anonymous record field for each auxiliary ntuple, containing their top-level fields. These original + // top-level fields are registered as subfields in the join model, such that they can be accessed as + // `auxNTupleName.fieldName`. + for (unsigned i = 0; i < auxModels.size(); ++i) { + std::vector> auxFields; + auxFields.reserve(auxModels[i]->GetFieldNames().size()); - auxFields.emplace_back(field.Clone(field.GetQualifiedFieldName())); - } - std::unique_ptr auxParentField = - std::make_unique(auxNTuple.fNTupleName, std::move(auxFields)); + for (const auto &fieldName : auxModels[i]->GetFieldNames()) { + auxFields.emplace_back(auxModels[i]->GetConstField(fieldName).Clone(fieldName)); + } - if (!auxParentField) { - throw RException(R__FAIL("could not create auxiliary RNTuple parent field")); - } + auto auxParentField = std::make_unique(fNTuples[i + 1].fNTupleName, std::move(auxFields)); + const auto &subFields = auxParentField->GetConstSubfields(); + fModel->AddField(std::move(auxParentField)); - const auto &subfields = auxParentField->GetConstSubfields(); - fModel->AddField(std::move(auxParentField)); - for (const auto &field : subfields) { - fModel->RegisterSubfield(field->GetQualifiedFieldName()); + for (const auto &field : subFields) { + fModel->RegisterSubfield(field->GetQualifiedFieldName()); + } + + // If the model has a default entry, adopt its value pointers. This way, the pointers returned by + // RNTupleModel::MakeField can be used in the processor loop to access the corresponding field values. 
+ if (!auxModels[i]->IsBare()) { + const auto &auxDefaultEntry = auxModels[i]->GetDefaultEntry(); + auto &joinDefaultEntry = fModel->GetDefaultEntry(); + for (const auto &fieldName : auxModels[i]->GetFieldNames()) { + auto valuePtr = auxDefaultEntry.GetPtr(fieldName); + joinDefaultEntry.BindValue(fNTuples[i + 1].fNTupleName + "." + fieldName, valuePtr); + } + } } fModel->Freeze(); - // After modifying the join model, we need to create a new entry since the old one is invalidated. However, we do - // want to carry over the value pointers, so the pointers returned by `MakeField` during the creation of the original - // model by the user can be used in the processor loop. - auto newEntry = fModel->CreateEntry(); - - for (const auto &value : *newEntry) { - const auto &field = value.GetField(); +} - // Skip if the field is the untyped record that holds the fields of auxiliary ntuples. - const auto fnIsNTuple = [&field](RNTupleOpenSpec n) { return n.fNTupleName == field.GetFieldName(); }; - if (std::find_if(fNTuples.cbegin(), fNTuples.cend(), fnIsNTuple) != fNTuples.end()) { - continue; - } +void ROOT::Experimental::RNTupleJoinProcessor::AddAuxiliary(const RNTupleOpenSpec &auxNTuple, + const std::vector &joinFields, + std::size_t ntupleIdx) +{ + assert(fNEntriesProcessed == 0 && "cannot add auxiliary ntuples after processing has started"); - auto fieldContext = fFieldContexts.find(field.GetQualifiedFieldName()); - // If the field belongs to the auxiliary ntuple currently being added, apart from assigning its entry value the - // correct pointer, we also have to create a field context for it. - if (fieldContext == fFieldContexts.end()) { - // If the model has a default entry, use the value pointers from the entry in the entry managed by the - // processor. This way, the pointers returned by RNTupleModel::MakeField can be used in the processor loop to - // access the corresponding field values. 
- if (!model->IsBare()) { - auto valuePtr = model->GetDefaultEntry().GetPtr(field.GetFieldName()); - newEntry->BindValue(field.GetQualifiedFieldName(), valuePtr); - } + auto &auxParentField = fModel->GetConstField(auxNTuple.fNTupleName); - auto token = newEntry->GetToken(field.GetQualifiedFieldName()); - fFieldContexts.try_emplace(field.GetQualifiedFieldName(), field.Clone(field.GetFieldName()), token, - fNTuples.size() - 1); - } else { - auto valuePtr = fEntry->GetPtr(fieldContext->second.fToken); - auto newToken = newEntry->GetToken(field.GetQualifiedFieldName()); - newEntry->BindValue(newToken, valuePtr); - fieldContext->second.fToken = std::move(newToken); + for (const auto &field : auxParentField.GetConstSubfields()) { + // If the model was provided by the user and it has a default entry, use the value pointers from the entry in + // the entry managed by the processor. This way, the pointers returned by RNTupleModel::MakeField can be used + // in the processor loop to access the corresponding field values. + if (!fModel->IsBare()) { + auto valuePtr = fModel->GetDefaultEntry().GetPtr(field->GetQualifiedFieldName()); + fEntry->BindValue(field->GetQualifiedFieldName(), valuePtr); } - } - fEntry.swap(newEntry); + auto token = fEntry->GetToken(field->GetQualifiedFieldName()); + fFieldContexts.try_emplace(field->GetQualifiedFieldName(), field->Clone(field->GetFieldName()), token, ntupleIdx); + } // If no join fields have been specified, an aligned join is assumed and an join table won't be created. 
if (!joinFields.empty()) fJoinTables.emplace_back(Internal::RNTupleJoinTable::Create(joinFields)); - - fAuxiliaryPageSources.emplace_back(std::move(pageSource)); } void ROOT::Experimental::RNTupleJoinProcessor::ConnectFields() diff --git a/tree/ntuple/v7/test/ntuple_processor_join.cxx b/tree/ntuple/v7/test/ntuple_processor_join.cxx index ab7ac46a13324..60c76d0964396 100644 --- a/tree/ntuple/v7/test/ntuple_processor_join.cxx +++ b/tree/ntuple/v7/test/ntuple_processor_join.cxx @@ -243,24 +243,21 @@ TEST_F(RNTupleJoinProcessorTest, MissingEntries) TEST_F(RNTupleJoinProcessorTest, WithModel) { - auto model1 = RNTupleModel::Create(); - auto i = model1->MakeField("i"); - auto x = model1->MakeField("x"); + auto primaryModel = RNTupleModel::Create(); + auto i = primaryModel->MakeField("i"); + auto x = primaryModel->MakeField("x"); - auto model2 = RNTupleModel::Create(); - auto y = model2->MakeField>("y"); + std::vector> auxModels; - auto model3 = RNTupleModel::Create(); - auto z = model3->MakeField("z"); + auxModels.push_back(RNTupleModel::Create()); + auto y = auxModels.back()->MakeField>("y"); - std::vector> models; - models.push_back(std::move(model1)); - models.push_back(std::move(model2)); - models.push_back(std::move(model3)); + auxModels.push_back(RNTupleModel::Create()); + auto z = auxModels.back()->MakeField("z"); auto proc = RNTupleProcessor::CreateJoin({fNTupleNames[0], fFileNames[0]}, {{fNTupleNames[1], fFileNames[1]}, {fNTupleNames[2], fFileNames[2]}}, {"i"}, - std::move(models)); + std::move(primaryModel), std::move(auxModels)); int nEntries = 0; std::vector yExpected; From e94679b5b3a54d469d82fd503a02402c34a9a32c Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Wed, 12 Mar 2025 16:55:41 +0100 Subject: [PATCH 3/7] [ntuple] Refactor `AddAuxiliary` away --- tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx | 9 ---- tree/ntuple/v7/src/RNTupleProcessor.cxx | 49 ++++++-------------- 2 files changed, 13 insertions(+), 45 deletions(-) diff --git 
a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx index 65ff82d61dc47..09b09b30892ed 100644 --- a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx +++ b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx @@ -582,15 +582,6 @@ private: std::unique_ptr primaryModel = nullptr, std::vector> auxModels = {}); - ///////////////////////////////////////////////////////////////////////////// - /// \brief Add an auxiliary RNTuple to the processor. - /// - /// \param[in] auxNTuple The source specification (name and storage location) of the auxiliary RNTuple. - /// \param[in] joinFields The names of the fields used in the join. - /// \param[in] ntupleIdx The index of the ntuple according to fNTuples. - void - AddAuxiliary(const RNTupleOpenSpec &auxNTuple, const std::vector &joinFields, std::size_t ntupleIdx); - ///////////////////////////////////////////////////////////////////////////// /// \brief Connect all fields, once the primary and all auxiliary RNTuples have been added. 
void ConnectFields(); diff --git a/tree/ntuple/v7/src/RNTupleProcessor.cxx b/tree/ntuple/v7/src/RNTupleProcessor.cxx index cc2a7cc2c323c..b4531c07df8b7 100644 --- a/tree/ntuple/v7/src/RNTupleProcessor.cxx +++ b/tree/ntuple/v7/src/RNTupleProcessor.cxx @@ -361,6 +361,8 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(const RNTupleOpen for (const auto &auxNTuple : auxNTuples) { fAuxiliaryPageSources.emplace_back(auxNTuple.CreatePageSource()); + if (!joinFields.empty()) + fJoinTables.emplace_back(Internal::RNTupleJoinTable::Create(joinFields)); } if (!primaryModel) @@ -384,12 +386,6 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(const RNTupleOpen auto &field = value.GetField(); const auto &fieldName = field.GetQualifiedFieldName(); - auto isAuxParent = std::find_if(auxNTuples.cbegin(), auxNTuples.cend(), [&fieldName](const RNTupleOpenSpec &n) { - return fieldName.substr(0, n.fNTupleName.size()) == n.fNTupleName; - }); - if (isAuxParent != auxNTuples.end()) - continue; - // If the model provided by the user has a default entry, use the value pointers from the default entry of the // model that was passed to this constructor. This way, the pointers returned by RNTupleModel::MakeField can be // used in the processor loop to access the corresponding field values. 
@@ -398,11 +394,18 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(const RNTupleOpen fEntry->BindValue(fieldName, valuePtr); } - fFieldContexts.try_emplace(fieldName, field.Clone(fieldName), fEntry->GetToken(fieldName)); - } + auto auxNTupleName = std::find_if(auxNTuples.cbegin(), auxNTuples.cend(), [&fieldName](const RNTupleOpenSpec &n) { + return fieldName.substr(0, n.fNTupleName.size()) == n.fNTupleName; + }); - for (unsigned i = 0; i < auxNTuples.size(); ++i) { - AddAuxiliary(auxNTuples[i], joinFields, i + 1 /* ntupleIdx */); + if (auxNTupleName == auxNTuples.end()) { + fFieldContexts.try_emplace(fieldName, field.Clone(field.GetFieldName()), fEntry->GetToken(fieldName)); + } else if (fieldName != auxNTupleName->fNTupleName) { + // Add 1 because we also have to take into account the primary ntuple. + auto ntupleIdx = std::distance(auxNTuples.begin(), auxNTupleName) + 1; + fFieldContexts.try_emplace(fieldName, field.Clone(field.GetFieldName()), fEntry->GetToken(fieldName), + ntupleIdx); + } } } @@ -446,32 +449,6 @@ void ROOT::Experimental::RNTupleJoinProcessor::SetModel(std::unique_ptrFreeze(); } -void ROOT::Experimental::RNTupleJoinProcessor::AddAuxiliary(const RNTupleOpenSpec &auxNTuple, - const std::vector &joinFields, - std::size_t ntupleIdx) -{ - assert(fNEntriesProcessed == 0 && "cannot add auxiliary ntuples after processing has started"); - - auto &auxParentField = fModel->GetConstField(auxNTuple.fNTupleName); - - for (const auto &field : auxParentField.GetConstSubfields()) { - // If the model was provided by the user and it has a default entry, use the value pointers from the entry in - // the entry managed by the processor. This way, the pointers returned by RNTupleModel::MakeField can be used - // in the processor loop to access the corresponding field values. 
- if (!fModel->IsBare()) { - auto valuePtr = fModel->GetDefaultEntry().GetPtr(field->GetQualifiedFieldName()); - fEntry->BindValue(field->GetQualifiedFieldName(), valuePtr); - } - - auto token = fEntry->GetToken(field->GetQualifiedFieldName()); - fFieldContexts.try_emplace(field->GetQualifiedFieldName(), field->Clone(field->GetFieldName()), token, ntupleIdx); - } - - // If no join fields have been specified, an aligned join is assumed and an join table won't be created. - if (!joinFields.empty()) - fJoinTables.emplace_back(Internal::RNTupleJoinTable::Create(joinFields)); -} - void ROOT::Experimental::RNTupleJoinProcessor::ConnectFields() { for (auto &[_, fieldContext] : fFieldContexts) { From 14ec02dcef44195c13659907a1a0a4c28929e2af Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Wed, 12 Mar 2025 16:56:27 +0100 Subject: [PATCH 4/7] [ntuple] Add tests for partial model passing --- tree/ntuple/v7/test/ntuple_processor_join.cxx | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tree/ntuple/v7/test/ntuple_processor_join.cxx b/tree/ntuple/v7/test/ntuple_processor_join.cxx index 60c76d0964396..6dbc2d4747531 100644 --- a/tree/ntuple/v7/test/ntuple_processor_join.cxx +++ b/tree/ntuple/v7/test/ntuple_processor_join.cxx @@ -287,6 +287,100 @@ TEST_F(RNTupleJoinProcessorTest, WithModel) EXPECT_EQ(5, proc->GetNEntriesProcessed()); } +TEST_F(RNTupleJoinProcessorTest, PartialModels) +{ + { + std::vector> auxModels; + + auxModels.emplace_back(RNTupleModel::Create()); + auto y = auxModels.back()->MakeField>("y"); + + auxModels.emplace_back(RNTupleModel::Create()); + auto z = auxModels.back()->MakeField("z"); + + // no primary model provided, aux models have been provided + auto procNoPrimaryModel = RNTupleProcessor::CreateJoin( + {fNTupleNames[0], fFileNames[0]}, {{fNTupleNames[1], fFileNames[1]}, {fNTupleNames[2], fFileNames[2]}}, {"i"}, + nullptr, std::move(auxModels)); + + auto i = procNoPrimaryModel->GetEntry().GetPtr("i"); + std::vector yExpected; + 
for (auto &entry : *procNoPrimaryModel) { + EXPECT_EQ(procNoPrimaryModel->GetCurrentEntryNumber() * 2, *i); + EXPECT_FLOAT_EQ(*i * 0.5f, *entry.GetPtr("x")); + yExpected = {static_cast(*i * 0.2), 3.14, static_cast(*i * 1.3)}; + EXPECT_EQ(yExpected, *y); + EXPECT_FLOAT_EQ(static_cast(*i * 2.f), *z); + + try { + entry.GetPtr("ntuple2.z"); + FAIL() << "should not be able to access values from fields not present in the provided models"; + } catch (const ROOT::RException &err) { + EXPECT_THAT(err.what(), testing::HasSubstr("invalid field name: ntuple2.z")); + } + } + } + { + // primary model provided, no aux models have been provided + auto primaryModel = RNTupleModel::Create(); + auto i = primaryModel->MakeField("i"); + auto x = primaryModel->MakeField("x"); + + auto procNoAuxModels = RNTupleProcessor::CreateJoin( + {fNTupleNames[0], fFileNames[0]}, {{fNTupleNames[1], fFileNames[1]}, {fNTupleNames[2], fFileNames[2]}}, {"i"}, + std::move(primaryModel)); + + std::vector yExpected; + for (auto &entry : *procNoAuxModels) { + EXPECT_EQ(procNoAuxModels->GetCurrentEntryNumber() * 2, *i); + EXPECT_FLOAT_EQ(*i * 0.5f, *x); + yExpected = {static_cast(*i * 0.2), 3.14, static_cast(*i * 1.3)}; + EXPECT_EQ(yExpected, *entry.GetPtr>("ntuple2.y")); + EXPECT_FLOAT_EQ(static_cast(*i * 2.f), *entry.GetPtr("ntuple3.z")); + + try { + entry.GetPtr("ntuple2.z"); + FAIL() << "should not be able to access values from fields not present in the provided models"; + } catch (const ROOT::RException &err) { + EXPECT_THAT(err.what(), testing::HasSubstr("invalid field name: ntuple2.z")); + } + } + } + { + // primary model and model for first aux ntuple have been provided, but not for the second + auto primaryModel = RNTupleModel::Create(); + auto i = primaryModel->MakeField("i"); + auto x = primaryModel->MakeField("x"); + + std::vector> partialAuxModels; + + partialAuxModels.emplace_back(RNTupleModel::Create()); + auto y = partialAuxModels.back()->MakeField>("y"); + + 
partialAuxModels.emplace_back(nullptr); + + auto procPartialAuxModels = RNTupleProcessor::CreateJoin( + {fNTupleNames[0], fFileNames[0]}, {{fNTupleNames[1], fFileNames[1]}, {fNTupleNames[2], fFileNames[2]}}, {"i"}, + std::move(primaryModel), std::move(partialAuxModels)); + + std::vector yExpected; + for (auto &entry : *procPartialAuxModels) { + EXPECT_EQ(procPartialAuxModels->GetCurrentEntryNumber() * 2, *i); + EXPECT_FLOAT_EQ(*i * 0.5f, *x); + yExpected = {static_cast(*i * 0.2), 3.14, static_cast(*i * 1.3)}; + EXPECT_EQ(yExpected, *y); + EXPECT_FLOAT_EQ(static_cast(*i * 2.f), *entry.GetPtr("ntuple3.z")); + + try { + entry.GetPtr("ntuple2.z"); + FAIL() << "should not be able to access values from fields not present in the provided models"; + } catch (const ROOT::RException &err) { + EXPECT_THAT(err.what(), testing::HasSubstr("invalid field name: ntuple2.z")); + } + } + } +} + TEST_F(RNTupleJoinProcessorTest, TMemFile) { TMemFile memFile("test_ntuple_processor_join_tmemfile.root", "RECREATE"); From dbb62abf7d767901165b1283bd2429137ff696e0 Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Wed, 12 Mar 2025 17:00:49 +0100 Subject: [PATCH 5/7] [ntuple][NFC] Move `RNTupleJoinProcessor` ctor to bottom --- tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx | 32 ++++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx index 09b09b30892ed..7f4515fcf55c7 100644 --- a/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx +++ b/tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx @@ -561,6 +561,22 @@ private: /// This way, they can be accessed from the processor's entry as `auxNTupleName.fieldName`. void SetModel(std::unique_ptr primaryModel, std::vector> auxModels); + ///////////////////////////////////////////////////////////////////////////// + /// \brief Connect all fields, once the primary and all auxiliary RNTuples have been added. 
+ void ConnectFields(); + + ///////////////////////////////////////////////////////////////////////////// + /// \brief Populate fJoinFieldTokens with tokens for join fields belonging to the main RNTuple in the join model. + /// + /// \param[in] joinFields The names of the fields used in the join. + void SetJoinFieldTokens(const std::vector &joinFields) + { + fJoinFieldTokens.reserve(joinFields.size()); + for (const auto &fieldName : joinFields) { + fJoinFieldTokens.emplace_back(fEntry->GetToken(fieldName)); + } + } + ///////////////////////////////////////////////////////////////////////////// /// \brief Construct a new RNTupleJoinProcessor. /// @@ -582,22 +598,6 @@ private: std::unique_ptr primaryModel = nullptr, std::vector> auxModels = {}); - ///////////////////////////////////////////////////////////////////////////// - /// \brief Connect all fields, once the primary and all auxiliary RNTuples have been added. - void ConnectFields(); - - ///////////////////////////////////////////////////////////////////////////// - /// \brief Populate fJoinFieldTokens with tokens for join fields belonging to the main RNTuple in the join model. - /// - /// \param[in] joinFields The names of the fields used in the join. 
- void SetJoinFieldTokens(const std::vector &joinFields) - { - fJoinFieldTokens.reserve(joinFields.size()); - for (const auto &fieldName : joinFields) { - fJoinFieldTokens.emplace_back(fEntry->GetToken(fieldName)); - } - } - public: RNTupleJoinProcessor(const RNTupleJoinProcessor &) = delete; RNTupleJoinProcessor operator=(const RNTupleJoinProcessor &) = delete; From 628569793563db7ddc01c6031ae200050c1e62a6 Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Tue, 18 Mar 2025 15:43:54 +0100 Subject: [PATCH 6/7] [ntuple] Add test for bare models --- tree/ntuple/v7/test/ntuple_processor_join.cxx | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tree/ntuple/v7/test/ntuple_processor_join.cxx b/tree/ntuple/v7/test/ntuple_processor_join.cxx index 6dbc2d4747531..02c11e455a8fd 100644 --- a/tree/ntuple/v7/test/ntuple_processor_join.cxx +++ b/tree/ntuple/v7/test/ntuple_processor_join.cxx @@ -287,6 +287,53 @@ TEST_F(RNTupleJoinProcessorTest, WithModel) EXPECT_EQ(5, proc->GetNEntriesProcessed()); } +TEST_F(RNTupleJoinProcessorTest, WithBareModel) +{ + auto primaryModel = RNTupleModel::CreateBare(); + primaryModel->MakeField("i"); + primaryModel->MakeField("x"); + + std::vector> auxModels; + + auxModels.push_back(RNTupleModel::CreateBare()); + auxModels.back()->MakeField>("y"); + + auxModels.push_back(RNTupleModel::CreateBare()); + auxModels.back()->MakeField("z"); + + auto proc = RNTupleProcessor::CreateJoin({fNTupleNames[0], fFileNames[0]}, + {{fNTupleNames[1], fFileNames[1]}, {fNTupleNames[2], fFileNames[2]}}, {"i"}, + std::move(primaryModel), std::move(auxModels)); + + auto i = proc->GetEntry().GetPtr("i"); + auto x = proc->GetEntry().GetPtr("x"); + auto y = proc->GetEntry().GetPtr>("ntuple2.y"); + auto z = proc->GetEntry().GetPtr("ntuple3.z"); + + int nEntries = 0; + std::vector yExpected; + for (auto &entry : *proc) { + EXPECT_EQ(proc->GetCurrentEntryNumber(), nEntries++); + + EXPECT_EQ(proc->GetCurrentEntryNumber() * 2, *i); + + EXPECT_FLOAT_EQ(*i * 
0.5f, *x); + + yExpected = {static_cast(*i * 0.2), 3.14, static_cast(*i * 1.3)}; + EXPECT_EQ(yExpected, *y); + EXPECT_FLOAT_EQ(static_cast(*i * 2.f), *z); + + try { + entry.GetPtr("ntuple2.z"); + FAIL() << "should not be able to access values from fields not present in the provided models"; + } catch (const ROOT::RException &err) { + EXPECT_THAT(err.what(), testing::HasSubstr("invalid field name: ntuple2.z")); + } + } + + EXPECT_EQ(5, proc->GetNEntriesProcessed()); +} + TEST_F(RNTupleJoinProcessorTest, PartialModels) { { From 56660212ab44e1fb217c68e741f01f9615ae12b1 Mon Sep 17 00:00:00 2001 From: Florine de Geus Date: Fri, 21 Mar 2025 17:02:32 +0100 Subject: [PATCH 7/7] [ntuple] Add clarifying comment --- tree/ntuple/v7/src/RNTupleProcessor.cxx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tree/ntuple/v7/src/RNTupleProcessor.cxx b/tree/ntuple/v7/src/RNTupleProcessor.cxx index b4531c07df8b7..ac70a081db3fd 100644 --- a/tree/ntuple/v7/src/RNTupleProcessor.cxx +++ b/tree/ntuple/v7/src/RNTupleProcessor.cxx @@ -398,6 +398,13 @@ ROOT::Experimental::RNTupleJoinProcessor::RNTupleJoinProcessor(const RNTupleOpen return fieldName.substr(0, n.fNTupleName.size()) == n.fNTupleName; }); + // If the current field name does not begin with the name of one of the auxiliary ntuples, we are dealing with a + // field from the primary ntuple, so it can be added as a field context. Otherwise, if it does begin with the + // name, but is not equal to just the name (i.e. it is a subfield of `auxNTupleName`, which means it is a proper + // field in the corresponding auxiliary ntuple) we also need to add it as a field context. If it is exactly equal + // to an auxiliary ntuple name, it is the untyped record field containing the auxiliary fields itself. This one we + // don't want to add as a field context, because there is nothing to read from. + // TODO(fdegeus) handle the case where a primary field has the name of an auxiliary ntuple. 
if (auxNTupleName == auxNTuples.end()) { fFieldContexts.try_emplace(fieldName, field.Clone(field.GetFieldName()), fEntry->GetToken(fieldName)); } else if (fieldName != auxNTupleName->fNTupleName) {