Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ntuple] Add partitioning to RNTupleJoinTable #17919

Merged
merged 4 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 95 additions & 67 deletions tree/ntuple/v7/inc/ROOT/RNTupleJoinTable.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -35,63 +35,84 @@ namespace Internal {
// clang-format on
class RNTupleJoinTable {
public:
using NTupleJoinValue_t = std::uint64_t;
using JoinValue_t = std::uint64_t;
using PartitionKey_t = std::uint64_t;
static constexpr PartitionKey_t kDefaultPartitionKey = PartitionKey_t(-1);

private:
/////////////////////////////////////////////////////////////////////////////
/// Container for the hashes of the join fields.
///
/// Two instances compare equal when their `fFieldValues` vectors are equal, i.e. they hold the same values in the
/// same order.
class RCombinedJoinFieldValue {
public:
   std::vector<NTupleJoinValue_t> fFieldValues;

   /// Copy `fieldValues` into this container.
   ///
   /// The member is copy-constructed directly in the initializer list; a separate `reserve` followed by a copy
   /// assignment is redundant. The constructor is left non-`explicit` so that existing implicit conversions from a
   /// vector of values keep compiling.
   RCombinedJoinFieldValue(const std::vector<NTupleJoinValue_t> &fieldValues) : fFieldValues(fieldValues) {}

   inline bool operator==(const RCombinedJoinFieldValue &other) const { return other.fFieldValues == fFieldValues; }
};

/////////////////////////////////////////////////////////////////////////////
/// Hash combining the individual join field value hashes from RCombinedJoinFieldValue. Uses the implementation from
/// `boost::hash_combine` (see
/// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine).
struct RCombinedJoinFieldValueHash {
inline std::size_t operator()(const RCombinedJoinFieldValue &joinFieldValue) const
{
std::size_t combinedHash = 0;
for (const auto &fieldVal : joinFieldValue.fFieldValues) {
combinedHash ^= fieldVal + 0x9e3779b9 + (fieldVal << 6) + (fieldVal >> 2);
// clang-format off
/**
\class ROOT::Experimental::Internal::RNTupleJoinTable::REntryMapping
\ingroup NTuple
\brief Provides a mapping from one or several join field values to an entry index.
*/
// clang-format on
class REntryMapping {
private:
//////////////////////////////////////////////////////////////////////////
/// Container for the combined hashes of join field values.
struct RCombinedJoinFieldValue {
std::vector<JoinValue_t> fJoinFieldValues;

RCombinedJoinFieldValue(const std::vector<JoinValue_t> &joinFieldValues) : fJoinFieldValues(joinFieldValues) {}

inline bool operator==(const RCombinedJoinFieldValue &other) const
{
return other.fJoinFieldValues == fJoinFieldValues;
}
return combinedHash;
}
};
};

//////////////////////////////////////////////////////////////////////////
/// Hash combining the individual join field value hashes from RCombinedJoinFieldValue. Uses the implementation
/// from `boost::hash_combine` (see
/// https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine).
///
/// Each value is folded into the running seed, and the shift terms are taken from the seed (not from the value being
/// added). This makes the result depend on the order of the values, so `{a, b}` and `{b, a}` do not systematically
/// collide and repeated values do not cancel each other out.
struct RCombinedJoinFieldValueHash {
   inline std::size_t operator()(const RCombinedJoinFieldValue &joinFieldValue) const
   {
      std::size_t combinedHash = 0;
      for (const auto &fieldVal : joinFieldValue.fJoinFieldValues) {
         // seed ^= value + golden-ratio constant + (seed << 6) + (seed >> 2), as in boost::hash_combine.
         combinedHash ^= fieldVal + 0x9e3779b9 + (combinedHash << 6) + (combinedHash >> 2);
      }
      return combinedHash;
   }
};

/// The mapping itself. Maps field values (or combinations thereof in case the join key is composed of multiple
/// fields) to their respective entry numbers.
std::unordered_map<RCombinedJoinFieldValue, std::vector<ROOT::NTupleSize_t>, RCombinedJoinFieldValueHash>
fMapping;

/// Names of the join fields used for the mapping to their respective entry indexes.
std::vector<std::string> fJoinFieldNames;

/// The join table itself. Maps field values (or combinations thereof in case the join table is defined for multiple
/// fields) to their respective entry indexes.
std::unordered_map<RCombinedJoinFieldValue, std::vector<ROOT::NTupleSize_t>, RCombinedJoinFieldValueHash> fJoinTable;
/// The size (in bytes) for each join field, corresponding to `fJoinFieldNames`. This information is stored to be
/// able to properly cast incoming void pointers to the join field values in `GetEntryIndexes`.
std::vector<std::size_t> fJoinFieldValueSizes;

public:
//////////////////////////////////////////////////////////////////////////
/// \brief Get the entry indexes for this entry mapping.
const std::vector<ROOT::NTupleSize_t> *GetEntryIndexes(std::vector<void *> valuePtrs) const;

//////////////////////////////////////////////////////////////////////////
/// \brief Create a new entry mapping.
///
/// \param[in] pageSource The page source of the RNTuple with the entries to map.
/// \param[in] joinFieldNames Names of the join fields to use in the mapping.
REntryMapping(RPageSource &pageSource, const std::vector<std::string> &joinFieldNames);
};
/// Names of the join fields used for the mapping to their respective entry indexes.
std::vector<std::string> fJoinFieldNames;

/// The size (in bytes) for each join field, corresponding to `fJoinFieldNames`. This information is stored to be
/// able to properly cast incoming void pointers to the join field values in `GetEntryIndexes`.
std::vector<std::size_t> fJoinFieldValueSizes;

/// Only built join tables can be queried.
bool fIsBuilt = false;
/// Partitions of one or multiple entry mappings.
std::unordered_map<PartitionKey_t, std::vector<std::unique_ptr<REntryMapping>>> fPartitions;

/////////////////////////////////////////////////////////////////////////////
/// \brief Create a new RNTupleJoinTable for the RNTuple represented by the provided page source.
///
/// \param[in] fieldNames The names of the join fields to use for the join table. Only integral-type fields are
/// \param[in] joinFieldNames The names of the join fields to use for the join table. Only integral-type fields are
/// allowed.
RNTupleJoinTable(const std::vector<std::string> &fieldNames) : fJoinFieldNames(fieldNames) {}

/////////////////////////////////////////////////////////////////////////////
/// \brief Ensure the RNTupleJoinTable has been built.
///
/// \throws RException If the join table has not been built, and can therefore not be used yet.
void EnsureBuilt() const;
RNTupleJoinTable(const std::vector<std::string> &joinFieldNames) : fJoinFieldNames(joinFieldNames) {}

public:
RNTupleJoinTable(const RNTupleJoinTable &other) = delete;
Expand All @@ -103,49 +124,56 @@ public:
/////////////////////////////////////////////////////////////////////////////
/// \brief Create an RNTupleJoinTable from an existing RNTuple.
///
/// \param[in] fieldNames The names of the join fields to use for the join table. Only integral-type fields are
/// \param[in] joinFieldNames The names of the join fields to use for the join table. Only integral-type fields are
/// allowed.
///
/// \return A pointer to the newly-created join table.
static std::unique_ptr<RNTupleJoinTable> Create(const std::vector<std::string> &fieldNames);
static std::unique_ptr<RNTupleJoinTable> Create(const std::vector<std::string> &joinFieldNames);

/////////////////////////////////////////////////////////////////////////////
/// \brief Build the join table.
/// \brief Add an entry mapping to the join table.
///
///
/// \param[in] pageSource The page source of the RNTuple for which to build the join table.
/// \param[in] pageSource The page source of the RNTuple with the entries to map.
/// \param[in] partitionKey Which partition to add the mapping to. If not provided, it will be added to the default
/// partition.
///
/// Only a built join table can be queried (with RNTupleJoinTable::GetEntryIndexes).
void Build(RPageSource &pageSource);
/// \return A reference to the updated join table.
RNTupleJoinTable &Add(RPageSource &pageSource, PartitionKey_t partitionKey = kDefaultPartitionKey);

/////////////////////////////////////////////////////////////////////////////
/// \brief Get the number of entries in the join table.
/// \brief Get all entry indexes for the given join field value(s) within a partition.
///
/// \return The number of entries in the join table.
/// \param[in] valuePtrs A vector of pointers to the join field values to look up.
/// \param[in] partitionKey The partition key to use for the lookup. If not provided, it will use the default
/// partition key.
///
/// \note This does not have to correspond to the number of entries in the original RNTuple. If the original RNTuple
/// contains duplicate join field values, they are counted as one.
std::size_t GetSize() const
{
// EnsureBuilt() is documented to throw an RException when the join table has not been built yet.
EnsureBuilt();
// Number of keys stored in the fJoinTable map; entries sharing the same join field values share one key.
return fJoinTable.size();
}
/// \return The entry numbers that correspond to `valuePtrs`. When there are no corresponding entries, an empty
/// vector is returned.
std::vector<ROOT::NTupleSize_t>
GetEntryIndexes(const std::vector<void *> &valuePtrs, PartitionKey_t partitionKey = kDefaultPartitionKey) const;

/////////////////////////////////////////////////////////////////////////////
/// \brief Whether the join table has been built (and therefore ready to be used).
/// \brief Get all entry indexes for the given join field value(s) for a specific set of partitions.
///
/// \return `true` if the join table has been built.
/// \param[in] valuePtrs A vector of pointers to the join field values to look up.
/// \param[in] partitionKeys The partition keys to use for the lookup.
///
/// Only built join tables can be queried.
bool IsBuilt() const { return fIsBuilt; }
/// \return The entry numbers that correspond to `valuePtrs`, grouped by partition. When there are no corresponding
/// entries, an empty map is returned.
std::unordered_map<PartitionKey_t, std::vector<ROOT::NTupleSize_t>>
GetPartitionedEntryIndexes(const std::vector<void *> &valuePtrs,
const std::vector<PartitionKey_t> &partitionKeys) const;

/////////////////////////////////////////////////////////////////////////////
/// \brief Get all entry indexes for the given join field value(s).
/// \brief Get all entry indexes for the given join field value(s) for all partitions.
///
/// \param[in] valuePtrs A vector of pointers to the join field values to look up.
///
/// \return The entry indexes that correspond to `valuePtrs`. An empty vector is returned when there are no matching
/// indexes.
std::vector<ROOT::NTupleSize_t> GetEntryIndexes(const std::vector<void *> &valuePtrs) const;
/// \return The entry numbers that correspond to `valuePtrs`, grouped by partition. When there are no corresponding
/// entries, an empty map is returned.
std::unordered_map<PartitionKey_t, std::vector<ROOT::NTupleSize_t>>
GetPartitionedEntryIndexes(const std::vector<void *> &valuePtrs) const;
};
} // namespace Internal
} // namespace Experimental
Expand Down
1 change: 1 addition & 0 deletions tree/ntuple/v7/inc/ROOT/RNTupleProcessor.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,7 @@ private:
/// Tokens representing the join fields present in the main RNTuple
std::vector<REntry::RFieldToken> fJoinFieldTokens;
std::vector<std::unique_ptr<Internal::RNTupleJoinTable>> fJoinTables;
bool fJoinTablesAreBuilt = false;

/// Whether at least one join table is held in `fJoinTables`.
bool HasJoinTable() const { return !fJoinTables.empty(); }

Expand Down
Loading
Loading