Skip to content

Commit 6095fe3

Browse files
committed
[df] Add new RDataSource for TTree-based processing
1 parent 2da80d3 commit 6095fe3

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

66 files changed

+1550
-376
lines changed

bindings/pyroot/pythonizations/test/string_view.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def test_rdataframe(self):
2727

2828
df = ROOT.RDataFrame(treename, filename)
2929
self.assertEqual(
30-
str(df), "A data frame built on top of the tree dataset.")
30+
str(df), "A data frame associated to the data source \"TTree data source\"")
3131
os.remove(filename)
3232

3333
def test_17497(self):

tree/dataframe/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
4949
ROOT/RRootDS.hxx
5050
ROOT/RSnapshotOptions.hxx
5151
ROOT/RTrivialDS.hxx
52+
ROOT/RTTreeDS.hxx
5253
ROOT/RDF/ActionHelpers.hxx
5354
ROOT/RDF/ColumnReaderUtils.hxx
5455
ROOT/RDF/GraphNode.hxx
@@ -107,7 +108,9 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
107108
src/RCutFlowReport.cxx
108109
src/RDataFrame.cxx
109110
src/RDatasetSpec.cxx
111+
src/RDataSource.cxx
110112
src/RDFActionHelpers.cxx
113+
src/RDFColumnReaderUtils.cxx
111114
src/RDFColumnRegister.cxx
112115
src/RDFDisplay.cxx
113116
src/RDFGraphUtils.cxx
@@ -127,6 +130,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
127130
src/RRangeBase.cxx
128131
src/RSample.cxx
129132
src/RResultPtr.cxx
133+
src/RTTreeDS.cxx
130134
src/RVariationBase.cxx
131135
src/RVariationReader.cxx
132136
src/RVariationsDescription.cxx

tree/dataframe/inc/ROOT/RCsvDS.hxx

-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ private:
7373
std::int64_t fDataLineNumber = 0;
7474
std::int64_t fLineNumber = 0; // used to skip the last lines
7575
std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
76-
unsigned int fNSlots = 0U;
7776
std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
7877
ULong64_t fEntryRangesRequested = 0ULL;
7978
ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines

tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx

+58-15
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#include "ROOT/RNTupleDS.hxx"
5353
#include "ROOT/RNTupleWriter.hxx" // for SnapshotRNTupleHelper
5454
#endif
55+
#include "ROOT/RTTreeDS.hxx"
5556

5657
#include <algorithm>
5758
#include <functional>
@@ -1530,12 +1531,15 @@ class R__CLING_PTRCHECK(off) SnapshotTTreeHelper : public RActionImpl<SnapshotTT
15301531
std::vector<void *> fBranchAddresses; // Addresses of objects associated to output branches
15311532
RBranchSet fOutputBranches;
15321533
std::vector<bool> fIsDefine;
1534+
std::weak_ptr<ROOT::Detail::RDF::RLoopManager> fOutputLoopManager;
1535+
ROOT::RDF::RDataSource *fInputDataSource;
15331536

15341537
public:
15351538
using ColumnTypes_t = TypeList<ColTypes...>;
15361539
SnapshotTTreeHelper(std::string_view filename, std::string_view dirname, std::string_view treename,
15371540
const ColumnNames_t &vbnames, const ColumnNames_t &bnames, const RSnapshotOptions &options,
1538-
std::vector<bool> &&isDefine)
1541+
std::vector<bool> &&isDefine, std::weak_ptr<ROOT::Detail::RDF::RLoopManager> loopManager,
1542+
ROOT::RDF::RDataSource *inputDataSource)
15391543
: fFileName(filename),
15401544
fDirName(dirname),
15411545
fTreeName(treename),
@@ -1544,7 +1548,9 @@ public:
15441548
fOutputBranchNames(ReplaceDotWithUnderscore(bnames)),
15451549
fBranches(vbnames.size(), nullptr),
15461550
fBranchAddresses(vbnames.size(), nullptr),
1547-
fIsDefine(std::move(isDefine))
1551+
fIsDefine(std::move(isDefine)),
1552+
fOutputLoopManager(loopManager),
1553+
fInputDataSource(inputDataSource)
15481554
{
15491555
EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName);
15501556
}
@@ -1571,6 +1577,8 @@ public:
15711577
{
15721578
if (r)
15731579
fInputTree = r->GetTree();
1580+
else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputDataSource))
1581+
fInputTree = treeDS->GetTree();
15741582
fBranchAddressesNeedReset = true;
15751583
}
15761584

@@ -1650,6 +1658,11 @@ public:
16501658
// must destroy the TTree first, otherwise TFile will delete it too leading to a double delete
16511659
fOutputTree.reset();
16521660
fOutputFile->Close();
1661+
1662+
// Now connect the data source to the loop manager so it can be used for further processing
1663+
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
1664+
if (auto lmPtr = fOutputLoopManager.lock())
1665+
lmPtr->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
16531666
}
16541667

16551668
std::string GetActionName() { return "Snapshot"; }
@@ -1673,8 +1686,15 @@ public:
16731686
SnapshotTTreeHelper MakeNew(void *newName, std::string_view /*variation*/ = "nominal")
16741687
{
16751688
const std::string finalName = *reinterpret_cast<const std::string *>(newName);
1676-
return SnapshotTTreeHelper{
1677-
finalName, fDirName, fTreeName, fInputBranchNames, fOutputBranchNames, fOptions, std::vector<bool>(fIsDefine)};
1689+
return SnapshotTTreeHelper{finalName,
1690+
fDirName,
1691+
fTreeName,
1692+
fInputBranchNames,
1693+
fOutputBranchNames,
1694+
fOptions,
1695+
std::vector<bool>(fIsDefine),
1696+
fOutputLoopManager,
1697+
fInputDataSource};
16781698
}
16791699
};
16801700

@@ -1699,12 +1719,17 @@ class R__CLING_PTRCHECK(off) SnapshotTTreeHelperMT : public RActionImpl<Snapshot
16991719
std::vector<std::vector<void *>> fBranchAddresses;
17001720
std::vector<RBranchSet> fOutputBranches;
17011721
std::vector<bool> fIsDefine;
1722+
std::weak_ptr<ROOT::Detail::RDF::RLoopManager> fOutputLoopManager;
1723+
ROOT::RDF::RDataSource *fInputDataSource;
17021724

17031725
public:
17041726
using ColumnTypes_t = TypeList<ColTypes...>;
1727+
17051728
SnapshotTTreeHelperMT(const unsigned int nSlots, std::string_view filename, std::string_view dirname,
17061729
std::string_view treename, const ColumnNames_t &vbnames, const ColumnNames_t &bnames,
1707-
const RSnapshotOptions &options, std::vector<bool> &&isDefine)
1730+
const RSnapshotOptions &options, std::vector<bool> &&isDefine,
1731+
std::weak_ptr<ROOT::Detail::RDF::RLoopManager> loopManager,
1732+
ROOT::RDF::RDataSource *inputDataSource)
17081733
: fNSlots(nSlots),
17091734
fOutputFiles(fNSlots),
17101735
fOutputTrees(fNSlots),
@@ -1719,7 +1744,9 @@ public:
17191744
fBranches(fNSlots, std::vector<TBranch *>(vbnames.size(), nullptr)),
17201745
fBranchAddresses(fNSlots, std::vector<void *>(vbnames.size(), nullptr)),
17211746
fOutputBranches(fNSlots),
1722-
fIsDefine(std::move(isDefine))
1747+
fIsDefine(std::move(isDefine)),
1748+
fOutputLoopManager(loopManager),
1749+
fInputDataSource(inputDataSource)
17231750
{
17241751
EnsureValidSnapshotTTreeOutput(fOptions, fTreeName, fFileName);
17251752
}
@@ -1766,7 +1793,9 @@ public:
17661793
if (r) {
17671794
// not an empty-source RDF
17681795
fInputTrees[slot] = r->GetTree();
1769-
}
1796+
} else if (auto treeDS = dynamic_cast<ROOT::Internal::RDF::RTTreeDS *>(fInputDataSource))
1797+
fInputTrees[slot] = treeDS->GetTree();
1798+
17701799
fBranchAddressesNeedReset[slot] = 1; // reset first event flag for this slot
17711800
}
17721801

@@ -1855,6 +1884,11 @@ public:
18551884
// flush all buffers to disk by destroying the TBufferMerger
18561885
fOutputFiles.clear();
18571886
fMerger.reset();
1887+
1888+
// Now connect the data source to the loop manager so it can be used for further processing
1889+
auto fullTreeName = fDirName.empty() ? fTreeName : fDirName + '/' + fTreeName;
1890+
if (auto lmPtr = fOutputLoopManager.lock())
1891+
lmPtr->SetDataSource(std::make_unique<ROOT::Internal::RDF::RTTreeDS>(fullTreeName, fFileName));
18581892
}
18591893

18601894
std::string GetActionName() { return "Snapshot"; }
@@ -1878,8 +1912,16 @@ public:
18781912
SnapshotTTreeHelperMT MakeNew(void *newName, std::string_view /*variation*/ = "nominal")
18791913
{
18801914
const std::string finalName = *reinterpret_cast<const std::string *>(newName);
1881-
return SnapshotTTreeHelperMT{fNSlots, finalName, fDirName, fTreeName,
1882-
fInputBranchNames, fOutputBranchNames, fOptions, std::vector<bool>(fIsDefine)};
1915+
return SnapshotTTreeHelperMT{fNSlots,
1916+
finalName,
1917+
fDirName,
1918+
fTreeName,
1919+
fInputBranchNames,
1920+
fOutputBranchNames,
1921+
fOptions,
1922+
std::vector<bool>(fIsDefine),
1923+
fOutputLoopManager,
1924+
fInputDataSource};
18831925
}
18841926
};
18851927

@@ -1906,7 +1948,7 @@ class R__CLING_PTRCHECK(off) SnapshotRNTupleHelper : public RActionImpl<Snapshot
19061948
std::unique_ptr<TFile> fOutputFile{nullptr};
19071949

19081950
RSnapshotOptions fOptions;
1909-
ROOT::Detail::RDF::RLoopManager *fLoopManager;
1951+
std::weak_ptr<ROOT::Detail::RDF::RLoopManager> fOutputLoopManager;
19101952
ColumnNames_t fInputFieldNames; // This contains the resolved aliases
19111953
ColumnNames_t fOutputFieldNames;
19121954
std::unique_ptr<ROOT::Experimental::RNTupleWriter> fWriter{nullptr};
@@ -1919,11 +1961,11 @@ public:
19191961
using ColumnTypes_t = TypeList<ColTypes...>;
19201962
SnapshotRNTupleHelper(std::string_view filename, std::string_view ntuplename, const ColumnNames_t &vfnames,
19211963
const ColumnNames_t &fnames, const RSnapshotOptions &options,
1922-
ROOT::Detail::RDF::RLoopManager *lm, std::vector<bool> &&isDefine)
1964+
std::weak_ptr<ROOT::Detail::RDF::RLoopManager> lm, std::vector<bool> &&isDefine)
19231965
: fFileName(filename),
19241966
fNTupleName(ntuplename),
19251967
fOptions(options),
1926-
fLoopManager(lm),
1968+
fOutputLoopManager(lm),
19271969
fInputFieldNames(vfnames),
19281970
fOutputFieldNames(ReplaceDotWithUnderscore(fnames)),
19291971
fIsDefine(std::move(isDefine))
@@ -1937,7 +1979,7 @@ public:
19371979
SnapshotRNTupleHelper &operator=(SnapshotRNTupleHelper &&) = default;
19381980
~SnapshotRNTupleHelper()
19391981
{
1940-
if (!fNTupleName.empty() && !fLoopManager->GetDataSource() && fOptions.fLazy)
1982+
if (auto lm = fOutputLoopManager.lock(); lm && !fNTupleName.empty() && !lm->GetDataSource() && fOptions.fLazy)
19411983
Warning("Snapshot", "A lazy Snapshot action was booked but never triggered.");
19421984
}
19431985

@@ -1993,7 +2035,8 @@ public:
19932035
{
19942036
fWriter.reset();
19952037
// We can now set the data source of the loop manager for the RDataFrame that is returned by the Snapshot call.
1996-
fLoopManager->SetDataSource(std::make_unique<ROOT::Experimental::RNTupleDS>(fNTupleName, fFileName));
2038+
if (auto lm = fOutputLoopManager.lock())
2039+
lm->SetDataSource(std::make_unique<ROOT::Experimental::RNTupleDS>(fNTupleName, fFileName));
19972040
}
19982041

19992042
std::string GetActionName() { return "Snapshot"; }
@@ -2022,7 +2065,7 @@ public:
20222065
fInputFieldNames,
20232066
fOutputFieldNames,
20242067
fOptions,
2025-
fLoopManager,
2068+
fOutputLoopManager,
20262069
std::vector<bool>(fIsDefine)};
20272070
}
20282071
};

tree/dataframe/inc/ROOT/RDF/ColumnReaderUtils.hxx

+19-24
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,11 @@
1717
#include "RDefineReader.hxx"
1818
#include "RDSColumnReader.hxx"
1919
#include "RLoopManager.hxx"
20-
#include "RTreeColumnReader.hxx"
2120
#include "RVariationBase.hxx"
2221
#include "RVariationReader.hxx"
2322

2423
#include <ROOT/RDataSource.hxx>
2524
#include <ROOT/TypeTraits.hxx>
26-
#include <TTreeReader.h>
2725

2826
#include <array>
2927
#include <cassert>
@@ -33,32 +31,21 @@
3331
#include <typeinfo> // for typeid
3432
#include <vector>
3533

34+
class TTreeReader;
35+
3636
namespace ROOT {
3737
namespace Internal {
3838
namespace RDF {
3939

4040
using namespace ROOT::TypeTraits;
4141
namespace RDFDetail = ROOT::Detail::RDF;
4242

43-
template <typename T>
4443
RDFDetail::RColumnReaderBase *GetColumnReader(unsigned int slot, RColumnReaderBase *defineOrVariationReader,
45-
RLoopManager &lm, TTreeReader *r, const std::string &colName)
46-
{
47-
if (defineOrVariationReader != nullptr)
48-
return defineOrVariationReader;
49-
50-
// Check if we already inserted a reader for this column in the dataset column readers (RDataSource or Tree/TChain
51-
// readers)
52-
auto *datasetColReader = lm.GetDatasetColumnReader(slot, colName, typeid(T));
53-
if (datasetColReader != nullptr)
54-
return datasetColReader;
44+
RLoopManager &lm, const std::string &colName, const std::type_info &ti);
5545

56-
assert(r != nullptr && "We could not find a reader for this column, this should never happen at this point.");
57-
58-
// Make a RTreeColumnReader for this column and insert it in RLoopManager's map
59-
auto treeColReader = std::make_unique<RTreeColumnReader<T>>(*r, colName);
60-
return lm.AddTreeColumnReader(slot, colName, std::move(treeColReader), typeid(T));
61-
}
46+
RDFDetail::RColumnReaderBase *GetColumnReaderForTTreeIMT(unsigned int slot, RColumnReaderBase *defineOrVariationReader,
47+
RLoopManager &lm, TTreeReader &treeReader,
48+
const std::string &colName, const std::type_info &ti);
6249

6350
/// This type aggregates some of the arguments passed to GetColumnReaders.
6451
/// We need to pass a single RColumnReadersInfo object rather than each argument separately because with too many
@@ -74,7 +61,7 @@ struct RColumnReadersInfo {
7461
/// Create a group of column readers, one per type in the parameter pack.
7562
template <typename... ColTypes>
7663
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)>
77-
GetColumnReaders(unsigned int slot, TTreeReader *r, TypeList<ColTypes...>, const RColumnReadersInfo &colInfo,
64+
GetColumnReaders(unsigned int slot, TTreeReader *treeReader, TypeList<ColTypes...>, const RColumnReadersInfo &colInfo,
7865
const std::string &variationName = "nominal")
7966
{
8067
// see RColumnReadersInfo for why we pass these arguments like this rather than directly as function arguments
@@ -83,10 +70,18 @@ GetColumnReaders(unsigned int slot, TTreeReader *r, TypeList<ColTypes...>, const
8370
auto &colRegister = colInfo.fColRegister;
8471

8572
int i = -1;
86-
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)> ret{
87-
(++i, GetColumnReader<ColTypes>(slot, colRegister.GetReader(slot, colNames[i], variationName, typeid(ColTypes)),
88-
lm, r, colNames[i]))...};
89-
return ret;
73+
if (!treeReader) {
74+
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)> ret{
75+
(++i, GetColumnReader(slot, colRegister.GetReader(slot, colNames[i], variationName, typeid(ColTypes)), lm,
76+
colNames[i], typeid(ColTypes)))...};
77+
return ret;
78+
} else {
79+
std::array<RDFDetail::RColumnReaderBase *, sizeof...(ColTypes)> ret{
80+
(++i,
81+
GetColumnReaderForTTreeIMT(slot, colRegister.GetReader(slot, colNames[i], variationName, typeid(ColTypes)),
82+
lm, *treeReader, colNames[i], typeid(ColTypes)))...};
83+
return ret;
84+
}
9085
}
9186

9287
// Shortcut overload for the case of no columns

0 commit comments

Comments
 (0)