Skip to content

[ML] Refactor initialization of the bucket gatherer #2844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ Checks: >
-modernize-use-trailing-return-type,
-modernize-concat-nested-namespaces,
-modernize-use-nodiscard,
-modernize-use-ranges,

performance-*,

32 changes: 25 additions & 7 deletions include/model/CBucketGatherer.h
Original file line number Diff line number Diff line change
@@ -32,7 +32,6 @@
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

namespace ml {
@@ -98,7 +97,7 @@ class MODEL_EXPORT CBucketGatherer {
//! \brief Hashes a ((size_t, size_t), string*) pair.
struct MODEL_EXPORT SSizeSizePrOptionalStrPrHash {
std::size_t operator()(const TSizeSizePrOptionalStrPr& key) const {
std::uint64_t seed = core::CHashing::hashCombine(
std::uint64_t const seed = core::CHashing::hashCombine(
static_cast<std::uint64_t>(key.first.first),
static_cast<std::uint64_t>(key.first.second));
return core::CHashing::hashCombine(seed, s_Hasher(key.second));
@@ -132,6 +131,26 @@ class MODEL_EXPORT CBucketGatherer {
using TTimeVec = std::vector<core_t::TTime>;
using TTimeVecCItr = TTimeVec::const_iterator;

struct SBucketGathererInitData {
// The name of the field holding the summary count.
const std::string& s_SummaryCountFieldName;
// The name of the field which identifies people.
const std::string& s_PersonFieldName;
// The name of the field which defines the person attributes.
const std::string& s_AttributeFieldName;
// The name of the field which contains the metric values.
const std::string& s_ValueFieldName;
// The field names for which we will compute influences.
const TStrVec& s_InfluenceFieldNames;
// The start of the time interval for which to gather data.
core_t::TTime s_StartTime;
// Override for the number of measurements
// in a statistic. (Note that this is intended for testing only.)
// A zero value means that the data gatherer class will determine
// an appropriate value for the bucket length and data rate.
unsigned int s_SampleOverrideCount;
};

public:
static const std::string EVENTRATE_BUCKET_GATHERER_TAG;
static const std::string METRIC_BUCKET_GATHERER_TAG;
@@ -142,11 +161,10 @@ class MODEL_EXPORT CBucketGatherer {
//! Create a new data series gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] numberInfluencers The number of result influencers
//! for which to gather data.
CBucketGatherer(CDataGatherer& dataGatherer, core_t::TTime startTime, std::size_t numberInfluencers);
//! \param[in] bucketGathererInitData The parameter initialization object
//! for the bucket gatherer.
CBucketGatherer(CDataGatherer& dataGatherer,
const SBucketGathererInitData& bucketGathererInitData);

//! Create a copy that will result in the same persisted state as the
//! original. This is effectively a copy constructor that creates a
6 changes: 3 additions & 3 deletions include/model/CCountingModelFactory.h
Original file line number Diff line number Diff line change
@@ -68,15 +68,15 @@ class MODEL_EXPORT CCountingModelFactory : public CModelFactory {
//! \param[in] initData The parameters needed to initialize the data
//! gatherer.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;

//! Make a new event rate data gatherer from part of a state document.
//!
//! \param[in] partitionFieldValue The partition field value.
//! \param[in,out] traverser A state document traverser.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
//@}

//! \name Defaults
62 changes: 12 additions & 50 deletions include/model/CDataGatherer.h
Original file line number Diff line number Diff line change
@@ -146,51 +146,26 @@ class MODEL_EXPORT CDataGatherer {
//! \param[in] summaryMode Indicates whether the data being gathered
//! are already summarized by an external aggregation process.
//! \param[in] modelParams The global configuration parameters.
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] partitionFieldValue The value of the field which splits
//! the data.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] key The key of the search for which to gatherer data.
//! \param[in] features The features of the data to model.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] sampleCountOverride for the number of measurements
//! in a statistic. (Note that this is intended for testing only.)
//! A zero value means that the data gatherer class will determine
//! an appropriate value for the bucket length and data rate.
//! \param[in] bucketGathererInitData The parameter initialization object for the bucket gatherer.
CDataGatherer(model_t::EAnalysisCategory gathererType,
model_t::ESummaryMode summaryMode,
const SModelParams& modelParams,
const std::string& summaryCountFieldName,
const std::string& partitionFieldValue,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
std::string partitionFieldValue,
const CSearchKey& key,
const TFeatureVec& features,
core_t::TTime startTime,
int sampleCountOverride);
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData);

//! Construct from a state document.
CDataGatherer(model_t::EAnalysisCategory gathererType,
model_t::ESummaryMode summaryMode,
const SModelParams& modelParams,
const std::string& summaryCountFieldName,
const std::string& partitionFieldValue,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
std::string partitionFieldValue,
const CSearchKey& key,
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
@@ -330,6 +305,7 @@ class MODEL_EXPORT CDataGatherer {
//! containing \p time.
//!
//! \param[in] time The time of interest.
//! \param[in] bucketLength The length of the bucketing interval.
//! \param[out] result Filled in with the feature data at \p time.
//! \tparam T The type of the feature data.
template<typename T>
@@ -648,9 +624,9 @@ class MODEL_EXPORT CDataGatherer {

//! Helper to avoid code duplication when getting a count from a
//! field. Logs different errors for missing value and invalid value.
bool extractCountFromField(const std::string& fieldName,
const std::string* fieldValue,
std::size_t& count) const;
static bool extractCountFromField(const std::string& fieldName,
const std::string* fieldValue,
std::size_t& count);

//! Helper to avoid code duplication when getting a metric value from a
//! field. Logs different errors for missing value and invalid value.
@@ -675,19 +651,11 @@ class MODEL_EXPORT CDataGatherer {

private:
//! Restore state from supplied traverser.
bool acceptRestoreTraverser(const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
bool acceptRestoreTraverser(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Restore a bucket gatherer from the supplied traverser.
bool restoreBucketGatherer(const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
bool restoreBucketGatherer(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Persist a bucket gatherer by passing information to the supplied
@@ -696,13 +664,7 @@ class MODEL_EXPORT CDataGatherer {

//! Create the bucket specific data gatherer.
void createBucketGatherer(model_t::EAnalysisCategory gathererType,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime,
unsigned int sampleCountOverride);
const CBucketGatherer::SBucketGathererInitData& initData);

private:
//! The type of the bucket gatherer(s) used.
38 changes: 7 additions & 31 deletions include/model/CEventRateBucketGatherer.h
Original file line number Diff line number Diff line change
@@ -111,33 +111,13 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
//! Create an event rate bucket gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] summaryCountFieldName If summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
//! \param[in] bucketGathererInitData The parameter initialization object.
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime);
const SBucketGathererInitData& bucketGathererInitData);

//! Construct from a state document.
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const SBucketGathererInitData& bucketGathererInitData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
@@ -444,11 +424,7 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
void startNewBucket(core_t::TTime time, bool skipUpdates) override;

//! Initialize the field names collection.
void initializeFieldNames(const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const std::string& summaryCountFieldName,
const TStrVec& influenceFieldNames);
void initializeFieldNames(const CBucketGatherer::SBucketGathererInitData& initData);

//! Initialize the feature data gatherers.
void initializeFeatureData();
@@ -481,13 +457,13 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
TStrVec m_FieldNames;

//! The position of the first influencer field
std::size_t m_BeginInfluencingFields;
std::size_t m_BeginInfluencingFields{0};

//! The position of the first count/value field.
std::size_t m_BeginValueField;
std::size_t m_BeginValueField{0};

//! The position of the field holding the summarised count.
std::size_t m_BeginSummaryFields;
std::size_t m_BeginSummaryFields{0};

//! The data features we are gathering.
TCategoryAnyMap m_FeatureData;
6 changes: 3 additions & 3 deletions include/model/CEventRateModelFactory.h
Original file line number Diff line number Diff line change
@@ -69,15 +69,15 @@ class MODEL_EXPORT CEventRateModelFactory final : public CModelFactory {
//! \param[in] initData The parameters needed to initialize the data
//! gatherer.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;

//! Make a new event rate data gatherer from part of a state document.
//!
//! \param[in] partitionFieldValue The partition field value.
//! \param[in,out] traverser A state document traverser.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
//@}

//! \name Defaults
6 changes: 3 additions & 3 deletions include/model/CEventRatePopulationModelFactory.h
Original file line number Diff line number Diff line change
@@ -69,16 +69,16 @@ class MODEL_EXPORT CEventRatePopulationModelFactory final : public CModelFactory
//! \param[in] initData The parameters needed to initialize the
//! data gatherer.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;

//! Make a new population event rate data gatherer from part of
//! a state document.
//!
//! \param[in] partitionFieldValue The partition field value.
//! \param[in,out] traverser A state document traverser.
//! \warning It is owned by the calling code.
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const override;
//@}

//! \name Defaults
39 changes: 8 additions & 31 deletions include/model/CMetricBucketGatherer.h
Original file line number Diff line number Diff line change
@@ -53,33 +53,13 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! Create a new population metric data gatherer.
//!
//! \param[in] dataGatherer The owning data gatherer.
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] personFieldName The name of the field which identifies
//! people.
//! \param[in] attributeFieldName The name of the field which defines
//! the person attributes.
//! \param[in] valueFieldName The name of the field which contains
//! the metric values.
//! \param[in] influenceFieldNames The field names for which we will
//! compute influences.
//! \param[in] startTime The start of the time interval for which
//! to gather data.
CMetricBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
core_t::TTime startTime);
//! \param[in] initData The parameter initialization object for the bucket
//! gatherer.
CMetricBucketGatherer(CDataGatherer& dataGatherer, const SBucketGathererInitData& initData);

//! Construct from a state document.
CMetricBucketGatherer(CDataGatherer& dataGatherer,
const std::string& summaryCountFieldName,
const std::string& personFieldName,
const std::string& attributeFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames,
const SBucketGathererInitData& initData,
core::CStateRestoreTraverser& traverser);

//! Create a copy that will result in the same persisted state as the
@@ -266,9 +246,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! 1) initializeFieldNamesPart1()
//! 2) restore state
//! 3) initializeFieldNamesPart2()
void initializeFieldNamesPart1(const std::string& personFieldName,
const std::string& attributeFieldName,
const TStrVec& influenceFieldNames);
void initializeFieldNamesPart1(const SBucketGathererInitData& initData);

//! Initialize the field names collection.
//! initializeFieldNamesPart1() must be called before this.
@@ -277,8 +255,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
//! 1) initializeFieldNamesPart1()
//! 2) restore state
//! 3) initializeFieldNamesPart2()
void initializeFieldNamesPart2(const std::string& valueFieldName,
const std::string& summaryCountFieldName);
void initializeFieldNamesPart2(const SBucketGathererInitData& initData);

//! Initialize the feature data gatherers.
void initializeFeatureData();
@@ -307,10 +284,10 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
TStrVec m_FieldNames;

//! The position of the first influencing field.
std::size_t m_BeginInfluencingFields;
std::size_t m_BeginInfluencingFields{0};

//! The position of the first count/value field.
std::size_t m_BeginValueFields;
std::size_t m_BeginValueFields{0};

//! For summarized values, this stores the metric categories
//! corresponding to the summarized field names in m_FieldNames;
Loading