Skip to content

Commit ef7c511

Browse files
committed
[df] Always process TTree via its RDataSource
The rest of the RDF codebase is adapted to use the new TTree data source. All previous usages of raw TTree classes or APIs are now delegated to the RDataSource. With this commit, there is no longer any way to create an RDataFrame that processes a TTree without going through an RDataSource.
1 parent 7391719 commit ef7c511

18 files changed

+239
-293
lines changed

tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx

Lines changed: 53 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252
#include "ROOT/RNTupleDS.hxx"
5353
#include "ROOT/RNTupleWriter.hxx" // for SnapshotRNTupleHelper
5454
#endif
55+
#include "ROOT/RTTreeDS.hxx"
5556

5657
#include <algorithm>
5758
#include <functional>
@@ -1530,12 +1531,15 @@ class R__CLING_PTRCHECK(off) SnapshotTTreeHelper : public RActionImpl<SnapshotTT
15301531
std::vector<void *> fBranchAddresses; // Addresses of objects associated to output branches
15311532
RBranchSet fOutputBranches;
15321533
std::vector<bool> fIsDefine;
1534+
ROOT::Detail::RDF::RLoopManager *fOutputLoopManager;
1535+
ROOT::RDF::RDataSource *fInputDataSource;
15331536

15341537
public:
15351538
using ColumnTypes_t = TypeList<ColTypes...>;
15361539
SnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename,
15371540
const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
1538-
std::vector<bool> &&isDefine)
1541+
std::vector<bool> &&isDefine, ROOT::Detail::RDF::RLoopManager *loopManager,
1542+
ROOT::RDF::RDataSource *inputDataSource)
15391543
: fFileName(filename),
15401544
fDirName(dirname),
15411545
fTreeName(treename),
@@ -1544,7 +1548,9 @@ public:
15441548
fOutputBranchNames(ReplaceDotWithUnderscore(bnames)),
15451549
fBranches(vbnames.size(), nullptr),
15461550
fBranchAddresses(vbnames.size(), nullptr),
1547-
fIsDefine(std::move(isDefine))
1551+
fIsDefine(std::move(isDefine)),
1552+
fOutputLoopManager(loopManager),
1553+
fInputDataSource(inputDataSource)
15481554
{
15491555
EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName);
15501556
}
@@ -1571,6 +1577,8 @@ public:
15711577
{
15721578
if (r)
15731579
fInputTree = r->GetTree();
1580+
else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputDataSource))
1581+
fInputTree = treeDS->GetTree();
15741582
fBranchAddressesNeedReset = true;
15751583
}
15761584

@@ -1650,6 +1658,10 @@ public:
16501658
// must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
16511659
fOutputTree.reset();
16521660
fOutputFile->Close();
1661+
1662+
// Now connect the data source to the loop manager so it can be used for further processing
1663+
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
1664+
fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
16531665
}
16541666

16551667
std::string GetActionName() { return "Snapshot"; }
@@ -1673,8 +1685,15 @@ public:
16731685
SnapshotTTreeHelper MakeNew(void *newName, std::string_view /*variation*/ = "nominal")
16741686
{
16751687
const std::string finalName = *reinterpret_cast<const std::string *>(newName);
1676-
return SnapshotTTreeHelper{
1677-
finalName, fDirName, fTreeName, fInputBranchNames, fOutputBranchNames, fOptions, std::vector<bool>(fIsDefine)};
1688+
return SnapshotTTreeHelper{finalName,
1689+
fDirName,
1690+
fTreeName,
1691+
fInputBranchNames,
1692+
fOutputBranchNames,
1693+
fOptions,
1694+
std::vector<bool>(fIsDefine),
1695+
fOutputLoopManager,
1696+
fInputDataSource};
16781697
}
16791698
};
16801699

@@ -1699,12 +1718,16 @@ class R__CLING_PTRCHECK(off) SnapshotTTreeHelperMT : public RActionImpl<Snapshot
16991718
std::vector<std::vector<void *>> fBranchAddresses;
17001719
std::vector<RBranchSet> fOutputBranches;
17011720
std::vector<bool> fIsDefine;
1721+
ROOT::Detail::RDF::RLoopManager *fOutputLoopManager;
1722+
ROOT::RDF::RDataSource *fInputDataSource;
17021723

17031724
public:
17041725
using ColumnTypes_t = TypeList<ColTypes...>;
1726+
17051727
SnapshotTTreeHelperMT(const unsigned int nSlots, std::string_view filename, std::string_view dirname,
17061728
std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames,
1707-
const RSnapshotOptions &options, std::vector<bool> &&isDefine)
1729+
const RSnapshotOptions &options, std::vector<bool> &&isDefine,
1730+
ROOT::Detail::RDF::RLoopManager *loopManager, ROOT::RDF::RDataSource *inputDataSource)
17081731
: fNSlots(nSlots),
17091732
fOutputFiles(fNSlots),
17101733
fOutputTrees(fNSlots),
@@ -1719,7 +1742,9 @@ public:
17191742
fBranches(fNSlots, std::vector<TBranch *>(vbnames.size(), nullptr)),
17201743
fBranchAddresses(fNSlots, std::vector<void *>(vbnames.size(), nullptr)),
17211744
fOutputBranches(fNSlots),
1722-
fIsDefine(std::move(isDefine))
1745+
fIsDefine(std::move(isDefine)),
1746+
fOutputLoopManager(loopManager),
1747+
fInputDataSource(inputDataSource)
17231748
{
17241749
EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName);
17251750
}
@@ -1766,7 +1791,9 @@ public:
17661791
if (r) {
17671792
// not an empty-source RDF
17681793
fInputTrees[slot] = r->GetTree();
1769-
}
1794+
} else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputDataSource))
1795+
fInputTrees[slot] = treeDS->GetTree();
1796+
17701797
fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
17711798
}
17721799

@@ -1855,6 +1882,10 @@ public:
18551882
// flush all buffers to disk by destroying the TBufferMerger
18561883
fOutputFiles.clear();
18571884
fMerger.reset();
1885+
1886+
// Now connect the data source to the loop manager so it can be used for further processing
1887+
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
1888+
fOutputLoopManager->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
18581889
}
18591890

18601891
std::string GetActionName() { return "Snapshot"; }
@@ -1878,8 +1909,16 @@ public:
18781909
SnapshotTTreeHelperMT MakeNew(void *newName, std::string_view /*variation*/ = "nominal")
18791910
{
18801911
const std::string finalName = *reinterpret_cast<const std::string *>(newName);
1881-
return SnapshotTTreeHelperMT{fNSlots, finalName, fDirName, fTreeName,
1882-
fInputBranchNames, fOutputBranchNames, fOptions, std::vector<bool>(fIsDefine)};
1912+
return SnapshotTTreeHelperMT{fNSlots,
1913+
finalName,
1914+
fDirName,
1915+
fTreeName,
1916+
fInputBranchNames,
1917+
fOutputBranchNames,
1918+
fOptions,
1919+
std::vector<bool>(fIsDefine),
1920+
fOutputLoopManager,
1921+
fInputDataSource};
18831922
}
18841923
};
18851924

@@ -1907,7 +1946,7 @@ class R__CLING_PTRCHECK(off) SnapshotRNTupleHelper : public RActionImpl<Snapshot
19071946
std::unique_ptr<TFile> fOutputFile{nullptr};
19081947

19091948
RSnapshotOptions fOptions;
1910-
ROOT::Detail::RDF::RLoopManager *fLoopManager;
1949+
ROOT::Detail::RDF::RLoopManager *fOutputLoopManager;
19111950
ColumnNames_t fInputFieldNames; // This contains the resolved aliases
19121951
ColumnNames_t fOutputFieldNames;
19131952
std::unique_ptr<ROOT::Experimental::RNTupleWriter> fWriter{nullptr};
@@ -1925,7 +1964,7 @@ public:
19251964
fDirName(dirname),
19261965
fNTupleName(ntuplename),
19271966
fOptions(options),
1928-
fLoopManager(lm),
1967+
fOutputLoopManager(lm),
19291968
fInputFieldNames(vfnames),
19301969
fOutputFieldNames(ReplaceDotWithUnderscore(fnames)),
19311970
fIsDefine(std::move(isDefine))
@@ -1939,7 +1978,7 @@ public:
19391978
SnapshotRNTupleHelper &operator=(SnapshotRNTupleHelper &&) = default;
19401979
~SnapshotRNTupleHelper()
19411980
{
1942-
if (!fNTupleName.empty() && !fLoopManager->GetDataSource() && fOptions.fLazy)
1981+
if (!fNTupleName.empty() && !fOutputLoopManager->GetDataSource() && fOptions.fLazy)
19431982
Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
19441983
}
19451984

@@ -1999,7 +2038,7 @@ public:
19992038
{
20002039
fWriter.reset();
20012040
// We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
2002-
fLoopManager->SetDataSource(
2041+
fOutputLoopManager->SetDataSource(
20032042
std::make_unique<ROOT::Experimental::RNTupleDS>(fDirName + "/" + fNTupleName, fFileName));
20042043
}
20052044

@@ -2029,7 +2068,7 @@ public:
20292068
fInputFieldNames,
20302069
fOutputFieldNames,
20312070
fOptions,
2032-
fLoopManager,
2071+
fOutputLoopManager,
20332072
std::vector<bool>(fIsDefine)};
20342073
}
20352074
};

tree/dataframe/inc/ROOT/RDF/ColumnReaderUtils.hxx

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@
3131
#include <typeinfo> // for typeid
3232
#include <vector>
3333

34+
class TTreeReader;
35+
3436
namespace ROOT {
3537
namespace Internal {
3638
namespace RDF {
@@ -56,7 +58,7 @@ struct RColumnReadersInfo {
5658
/// Create a group of column readers, one per type in the parameter pack.
5759
template <typename... ColTypes>
5860
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)>
59-
GetColumnReaders(unsigned int slot, TTreeReader *r, TypeList<ColTypes...>, const RColumnReadersInfo &colInfo,
61+
GetColumnReaders(unsigned int slot, TTreeReader *treeReader, TypeList<ColTypes...>, const RColumnReadersInfo &colInfo,
6062
const std::string &variationName = "nominal")
6163
{
6264
// see RColumnReadersInfo for why we pass these arguments like this rather than directly as function arguments
@@ -65,9 +67,10 @@ GetColumnReaders(unsigned int slot, TTreeReader *r, TypeList<ColTypes...>, const
6567
auto &colRegister = colInfo.fColRegister;
6668

6769
int i = -1;
70+
6871
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)> ret{
69-
(++i, GetColumnReader(slot, colRegister.GetReader(slot, colNames[i], variationName, typeid(ColTypes)), lm, r,
70-
colNames[i], typeid(ColTypes)))...};
72+
(++i, GetColumnReader(slot, colRegister.GetReader(slot, colNames[i], variationName, typeid(ColTypes)), lm,
73+
treeReader, colNames[i], typeid(ColTypes)))...};
7174
return ret;
7275
}
7376

tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx

Lines changed: 25 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -250,7 +250,8 @@ struct SnapshotHelperArgs {
250250
std::string fTreeName;
251251
std::vector<std::string> fOutputColNames;
252252
ROOT::RDF::RSnapshotOptions fOptions;
253-
RDFDetail::RLoopManager *fLoopManager;
253+
ROOT::Detail::RDF::RLoopManager *fLoopManager;
254+
ROOT::RDF::RDataSource *fDataSource;
254255
bool fToNTuple;
255256
};
256257

@@ -266,6 +267,8 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr<SnapshotHelperA
266267
const auto &treename = snapHelperArgs->fTreeName;
267268
const auto &outputColNames = snapHelperArgs->fOutputColNames;
268269
const auto &options = snapHelperArgs->fOptions;
270+
const auto &lmPtr = snapHelperArgs->fLoopManager;
271+
const auto &dataSource = snapHelperArgs->fDataSource;
269272

270273
auto sz = sizeof...(ColTypes);
271274
std::vector<bool> isDefine(sz);
@@ -280,10 +283,8 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr<SnapshotHelperA
280283
using Helper_t = SnapshotRNTupleHelper<ColTypes...>;
281284
using Action_t = RAction<Helper_t, PrevNodeType>;
282285

283-
auto loopManager = snapHelperArgs->fLoopManager;
284-
285286
actionPtr.reset(new Action_t(
286-
Helper_t(filename, dirname, treename, colNames, outputColNames, options, loopManager, std::move(isDefine)),
287+
Helper_t(filename, dirname, treename, colNames, outputColNames, options, lmPtr, std::move(isDefine)),
287288
colNames, prevNode, colRegister));
288289
} else {
289290
// multi-thread snapshot to RNTuple is not yet supported
@@ -302,16 +303,16 @@ BuildAction(const ColumnNames_t &colNames, const std::shared_ptr<SnapshotHelperA
302303
// single-thread snapshot
303304
using Helper_t = SnapshotTTreeHelper<ColTypes...>;
304305
using Action_t = RAction<Helper_t, PrevNodeType>;
305-
actionPtr.reset(
306-
new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)),
307-
colNames, prevNode, colRegister));
306+
actionPtr.reset(new Action_t(Helper_t(filename, dirname, treename, colNames, outputColNames, options,
307+
std::move(isDefine), lmPtr, dataSource),
308+
colNames, prevNode, colRegister));
308309
} else {
309310
// multi-thread snapshot
310311
using Helper_t = SnapshotTTreeHelperMT<ColTypes...>;
311312
using Action_t = RAction<Helper_t, PrevNodeType>;
312-
actionPtr.reset(new Action_t(
313-
Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options, std::move(isDefine)),
314-
colNames, prevNode, colRegister));
313+
actionPtr.reset(new Action_t(Helper_t(nSlots, filename, dirname, treename, colNames, outputColNames, options,
314+
std::move(isDefine), lmPtr, dataSource),
315+
colNames, prevNode, colRegister));
315316
}
316317
}
317318
return actionPtr;
@@ -412,8 +413,15 @@ std::vector<bool> FindUndefinedDSColumns(const ColumnNames_t &requestedCols, con
412413
template <typename T>
413414
void AddDSColumnsHelper(const std::string &colName, RLoopManager &lm, RDataSource &ds, RColumnRegister &colRegister)
414415
{
415-
if (colRegister.IsDefineOrAlias(colName) || !ds.HasColumn(colName) ||
416-
lm.HasDataSourceColumnReaders(colName, typeid(T)))
416+
417+
if (colRegister.IsDefineOrAlias(colName))
418+
return;
419+
420+
if (lm.HasDataSourceColumnReaders(colName, typeid(T)))
421+
return;
422+
423+
if (!ds.HasColumn(colName) &&
424+
lm.GetSuppressErrorsForMissingBranches().find(colName) == lm.GetSuppressErrorsForMissingBranches().end())
417425
return;
418426

419427
const auto nSlots = lm.GetNSlots();
@@ -428,7 +436,8 @@ void AddDSColumnsHelper(const std::string &colName, RLoopManager &lm, RDataSourc
428436
} else { // using the new GetColumnReaders mechanism
429437
// TODO consider changing the interface so we return all of these for all slots in one go
430438
for (auto slot = 0u; slot < lm.GetNSlots(); ++slot)
431-
colReaders.emplace_back(ds.GetColumnReaders(slot, colName, typeid(T)));
439+
colReaders.emplace_back(
440+
ROOT::Internal::RDF::CreateColumnReader(ds, slot, colName, typeid(T), /*treeReader*/ nullptr));
432441
}
433442

434443
lm.AddDataSourceColumnReaders(colName, std::move(colReaders), typeid(T));
@@ -540,7 +549,7 @@ void JitDefineHelper(F &&f, const char **colsPtr, std::size_t colsSize, std::str
540549
using ColTypes_t = typename TTraits::CallableTraits<Callable_t>::arg_types;
541550

542551
auto ds = lm->GetDataSource();
543-
if (ds != nullptr)
552+
if (ds != nullptr && colsPtr)
544553
AddDSColumns(cols, *lm, *ds, ColTypes_t(), *colRegister);
545554

546555
// will never actually be used (trumped by jittedDefine->GetTypeName()), but we set it to something meaningful
@@ -800,8 +809,8 @@ template <typename T>
800809
using InnerValueType_t = typename InnerValueType<T>::type;
801810

802811
std::pair<std::vector<std::string>, std::vector<std::string>>
803-
AddSizeBranches(const std::vector<std::string> &branches, TTree *tree, std::vector<std::string> &&colsWithoutAliases,
804-
std::vector<std::string> &&colsWithAliases);
812+
AddSizeBranches(const std::vector<std::string> &branches, ROOT::RDF::RDataSource *ds,
813+
std::vector<std::string> &&colsWithoutAliases, std::vector<std::string> &&colsWithAliases);
805814

806815
void RemoveDuplicates(ColumnNames_t &columnNames);
807816

0 commit comments

Comments
 (0)