Skip to content

Commit 66ed0f3

Browse files
authored
[ML] Refactor initialization of the bucket gatherer (#2844)
This is a purely refactoring PR. It introduces a parameter initialization object for the bucket gatherer to reduce the constructor signature and simplify future code extensions. I also added -modernize-use-ranges to the clang-tidy rules because Clang 14 does not support range algorithms.
1 parent c6334ab commit 66ed0f3

28 files changed

+361
-433
lines changed

.clang-tidy

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Checks: >
2323
-modernize-use-trailing-return-type,
2424
-modernize-concat-nested-namespaces,
2525
-modernize-use-nodiscard,
26+
-modernize-use-ranges,
2627
2728
performance-*,
2829

include/model/CBucketGatherer.h

+25-7
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
#include <map>
3333
#include <optional>
3434
#include <string>
35-
#include <utility>
3635
#include <vector>
3736

3837
namespace ml {
@@ -98,7 +97,7 @@ class MODEL_EXPORT CBucketGatherer {
9897
//! \brief Hashes a ((size_t, size_t), string*) pair.
9998
struct MODEL_EXPORT SSizeSizePrOptionalStrPrHash {
10099
std::size_t operator()(const TSizeSizePrOptionalStrPr& key) const {
101-
std::uint64_t seed = core::CHashing::hashCombine(
100+
std::uint64_t const seed = core::CHashing::hashCombine(
102101
static_cast<std::uint64_t>(key.first.first),
103102
static_cast<std::uint64_t>(key.first.second));
104103
return core::CHashing::hashCombine(seed, s_Hasher(key.second));
@@ -132,6 +131,26 @@ class MODEL_EXPORT CBucketGatherer {
132131
using TTimeVec = std::vector<core_t::TTime>;
133132
using TTimeVecCItr = TTimeVec::const_iterator;
134133

134+
struct SBucketGathererInitData {
135+
// The name of the field holding the summary count.
136+
const std::string& s_SummaryCountFieldName;
137+
// The name of the field which identifies people.
138+
const std::string& s_PersonFieldName;
139+
// The name of the field which defines the person attributes.
140+
const std::string& s_AttributeFieldName;
141+
// The name of the field which contains the metric values.
142+
const std::string& s_ValueFieldName;
143+
// The field names for which we will compute influences.
144+
const TStrVec& s_InfluenceFieldNames;
145+
// The start of the time interval for which to gather data.
146+
core_t::TTime s_StartTime;
147+
// Override for the number of measurements
148+
// in a statistic. (Note that this is intended for testing only.)
149+
// A zero value means that the data gatherer class will determine
150+
// an appropriate value for the bucket length and data rate.
151+
unsigned int s_SampleOverrideCount;
152+
};
153+
135154
public:
136155
static const std::string EVENTRATE_BUCKET_GATHERER_TAG;
137156
static const std::string METRIC_BUCKET_GATHERER_TAG;
@@ -142,11 +161,10 @@ class MODEL_EXPORT CBucketGatherer {
142161
//! Create a new data series gatherer.
143162
//!
144163
//! \param[in] dataGatherer The owning data gatherer.
145-
//! \param[in] startTime The start of the time interval for which
146-
//! to gather data.
147-
//! \param[in] numberInfluencers The number of result influencers
148-
//! for which to gather data.
149-
CBucketGatherer(CDataGatherer& dataGatherer, core_t::TTime startTime, std::size_t numberInfluencers);
164+
//! \param[in] bucketGathererInitData The parameter initialization object
165+
//! for the bucket gatherer.
166+
CBucketGatherer(CDataGatherer& dataGatherer,
167+
const SBucketGathererInitData& bucketGathererInitData);
150168

151169
//! Create a copy that will result in the same persisted state as the
152170
//! original. This is effectively a copy constructor that creates a

include/model/CCountingModelFactory.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -68,15 +68,15 @@ class MODEL_EXPORT CCountingModelFactory : public CModelFactory {
6868
//! \param[in] initData The parameters needed to initialize the data
6969
//! gatherer.
7070
//! \warning It is owned by the calling code.
71-
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
71+
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;
7272

7373
//! Make a new event rate data gatherer from part of a state document.
7474
//!
7575
//! \param[in] partitionFieldValue The partition field value.
7676
//! \param[in,out] traverser A state document traverser.
7777
//! \warning It is owned by the calling code.
78-
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
79-
core::CStateRestoreTraverser& traverser) const override;
78+
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
79+
core::CStateRestoreTraverser& traverser) const override;
8080
//@}
8181

8282
//! \name Defaults

include/model/CDataGatherer.h

+12-50
Original file line numberDiff line numberDiff line change
@@ -146,51 +146,26 @@ class MODEL_EXPORT CDataGatherer {
146146
//! \param[in] summaryMode Indicates whether the data being gathered
147147
//! are already summarized by an external aggregation process.
148148
//! \param[in] modelParams The global configuration parameters.
149-
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
150-
//! then this is the name of the field holding the summary count.
151149
//! \param[in] partitionFieldValue The value of the field which splits
152150
//! the data.
153-
//! \param[in] personFieldName The name of the field which identifies
154-
//! people.
155-
//! \param[in] attributeFieldName The name of the field which defines
156-
//! the person attributes.
157-
//! \param[in] valueFieldName The name of the field which contains
158-
//! the metric values.
159-
//! \param[in] influenceFieldNames The field names for which we will
160-
//! compute influences.
161151
//! \param[in] key The key of the search for which to gatherer data.
162152
//! \param[in] features The features of the data to model.
163-
//! \param[in] startTime The start of the time interval for which
164-
//! to gather data.
165-
//! \param[in] sampleCountOverride for the number of measurements
166-
//! in a statistic. (Note that this is intended for testing only.)
167-
//! A zero value means that the data gatherer class will determine
168-
//! an appropriate value for the bucket length and data rate.
153+
//! \param[in] bucketGathererInitData The parameter initialization object for the bucket gatherer.
169154
CDataGatherer(model_t::EAnalysisCategory gathererType,
170155
model_t::ESummaryMode summaryMode,
171156
const SModelParams& modelParams,
172-
const std::string& summaryCountFieldName,
173-
const std::string& partitionFieldValue,
174-
const std::string& personFieldName,
175-
const std::string& attributeFieldName,
176-
const std::string& valueFieldName,
177-
const TStrVec& influenceFieldNames,
157+
std::string partitionFieldValue,
178158
const CSearchKey& key,
179159
const TFeatureVec& features,
180-
core_t::TTime startTime,
181-
int sampleCountOverride);
160+
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData);
182161

183162
//! Construct from a state document.
184163
CDataGatherer(model_t::EAnalysisCategory gathererType,
185164
model_t::ESummaryMode summaryMode,
186165
const SModelParams& modelParams,
187-
const std::string& summaryCountFieldName,
188-
const std::string& partitionFieldValue,
189-
const std::string& personFieldName,
190-
const std::string& attributeFieldName,
191-
const std::string& valueFieldName,
192-
const TStrVec& influenceFieldNames,
166+
std::string partitionFieldValue,
193167
const CSearchKey& key,
168+
const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
194169
core::CStateRestoreTraverser& traverser);
195170

196171
//! Create a copy that will result in the same persisted state as the
@@ -330,6 +305,7 @@ class MODEL_EXPORT CDataGatherer {
330305
//! containing \p time.
331306
//!
332307
//! \param[in] time The time of interest.
308+
//! \param[in] bucketLength The length of the bucketing interval.
333309
//! \param[out] result Filled in with the feature data at \p time.
334310
//! \tparam T The type of the feature data.
335311
template<typename T>
@@ -648,9 +624,9 @@ class MODEL_EXPORT CDataGatherer {
648624

649625
//! Helper to avoid code duplication when getting a count from a
650626
//! field. Logs different errors for missing value and invalid value.
651-
bool extractCountFromField(const std::string& fieldName,
652-
const std::string* fieldValue,
653-
std::size_t& count) const;
627+
static bool extractCountFromField(const std::string& fieldName,
628+
const std::string* fieldValue,
629+
std::size_t& count);
654630

655631
//! Helper to avoid code duplication when getting a metric value from a
656632
//! field. Logs different errors for missing value and invalid value.
@@ -675,19 +651,11 @@ class MODEL_EXPORT CDataGatherer {
675651

676652
private:
677653
//! Restore state from supplied traverser.
678-
bool acceptRestoreTraverser(const std::string& summaryCountFieldName,
679-
const std::string& personFieldName,
680-
const std::string& attributeFieldName,
681-
const std::string& valueFieldName,
682-
const TStrVec& influenceFieldNames,
654+
bool acceptRestoreTraverser(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
683655
core::CStateRestoreTraverser& traverser);
684656

685657
//! Restore a bucket gatherer from the supplied traverser.
686-
bool restoreBucketGatherer(const std::string& summaryCountFieldName,
687-
const std::string& personFieldName,
688-
const std::string& attributeFieldName,
689-
const std::string& valueFieldName,
690-
const TStrVec& influenceFieldNames,
658+
bool restoreBucketGatherer(const CBucketGatherer::SBucketGathererInitData& bucketGathererInitData,
691659
core::CStateRestoreTraverser& traverser);
692660

693661
//! Persist a bucket gatherer by passing information to the supplied
@@ -696,13 +664,7 @@ class MODEL_EXPORT CDataGatherer {
696664

697665
//! Create the bucket specific data gatherer.
698666
void createBucketGatherer(model_t::EAnalysisCategory gathererType,
699-
const std::string& summaryCountFieldName,
700-
const std::string& personFieldName,
701-
const std::string& attributeFieldName,
702-
const std::string& valueFieldName,
703-
const TStrVec& influenceFieldNames,
704-
core_t::TTime startTime,
705-
unsigned int sampleCountOverride);
667+
const CBucketGatherer::SBucketGathererInitData& initData);
706668

707669
private:
708670
//! The type of the bucket gatherer(s) used.

include/model/CEventRateBucketGatherer.h

+7-31
Original file line numberDiff line numberDiff line change
@@ -111,33 +111,13 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
111111
//! Create an event rate bucket gatherer.
112112
//!
113113
//! \param[in] dataGatherer The owning data gatherer.
114-
//! \param[in] summaryCountFieldName If summaryMode is E_Manual
115-
//! then this is the name of the field holding the summary count.
116-
//! \param[in] personFieldName The name of the field which identifies
117-
//! people.
118-
//! \param[in] attributeFieldName The name of the field which defines
119-
//! the person attributes.
120-
//! \param[in] valueFieldName The name of the field which contains
121-
//! the metric values.
122-
//! \param[in] influenceFieldNames The field names for which we will
123-
//! compute influences.
124-
//! \param[in] startTime The start of the time interval for which
125-
//! to gather data.
114+
//! \param[in] bucketGathererInitData The parameter initialization object.
126115
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
127-
const std::string& summaryCountFieldName,
128-
const std::string& personFieldName,
129-
const std::string& attributeFieldName,
130-
const std::string& valueFieldName,
131-
const TStrVec& influenceFieldNames,
132-
core_t::TTime startTime);
116+
const SBucketGathererInitData& bucketGathererInitData);
133117

134118
//! Construct from a state document.
135119
CEventRateBucketGatherer(CDataGatherer& dataGatherer,
136-
const std::string& summaryCountFieldName,
137-
const std::string& personFieldName,
138-
const std::string& attributeFieldName,
139-
const std::string& valueFieldName,
140-
const TStrVec& influenceFieldNames,
120+
const SBucketGathererInitData& bucketGathererInitData,
141121
core::CStateRestoreTraverser& traverser);
142122

143123
//! Create a copy that will result in the same persisted state as the
@@ -444,11 +424,7 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
444424
void startNewBucket(core_t::TTime time, bool skipUpdates) override;
445425

446426
//! Initialize the field names collection.
447-
void initializeFieldNames(const std::string& personFieldName,
448-
const std::string& attributeFieldName,
449-
const std::string& valueFieldName,
450-
const std::string& summaryCountFieldName,
451-
const TStrVec& influenceFieldNames);
427+
void initializeFieldNames(const CBucketGatherer::SBucketGathererInitData& initData);
452428

453429
//! Initialize the feature data gatherers.
454430
void initializeFeatureData();
@@ -481,13 +457,13 @@ class MODEL_EXPORT CEventRateBucketGatherer final : public CBucketGatherer {
481457
TStrVec m_FieldNames;
482458

483459
//! The position of the first influencer field
484-
std::size_t m_BeginInfluencingFields;
460+
std::size_t m_BeginInfluencingFields{0};
485461

486462
//! The position of the first count/value field.
487-
std::size_t m_BeginValueField;
463+
std::size_t m_BeginValueField{0};
488464

489465
//! The position of the field holding the summarised count.
490-
std::size_t m_BeginSummaryFields;
466+
std::size_t m_BeginSummaryFields{0};
491467

492468
//! The data features we are gathering.
493469
TCategoryAnyMap m_FeatureData;

include/model/CEventRateModelFactory.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,15 @@ class MODEL_EXPORT CEventRateModelFactory final : public CModelFactory {
6969
//! \param[in] initData The parameters needed to initialize the data
7070
//! gatherer.
7171
//! \warning It is owned by the calling code.
72-
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
72+
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;
7373

7474
//! Make a new event rate data gatherer from part of a state document.
7575
//!
7676
//! \param[in] partitionFieldValue The partition field value.
7777
//! \param[in,out] traverser A state document traverser.
7878
//! \warning It is owned by the calling code.
79-
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
80-
core::CStateRestoreTraverser& traverser) const override;
79+
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
80+
core::CStateRestoreTraverser& traverser) const override;
8181
//@}
8282

8383
//! \name Defaults

include/model/CEventRatePopulationModelFactory.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,16 @@ class MODEL_EXPORT CEventRatePopulationModelFactory final : public CModelFactory
6969
//! \param[in] initData The parameters needed to initialize the
7070
//! data gatherer.
7171
//! \warning It is owned by the calling code.
72-
CDataGatherer* makeDataGatherer(const SGathererInitializationData& initData) const override;
72+
TDataGathererPtr makeDataGatherer(const SGathererInitializationData& initData) const override;
7373

7474
//! Make a new population event rate data gatherer from part of
7575
//! a state document.
7676
//!
7777
//! \param[in] partitionFieldValue The partition field value.
7878
//! \param[in,out] traverser A state document traverser.
7979
//! \warning It is owned by the calling code.
80-
CDataGatherer* makeDataGatherer(const std::string& partitionFieldValue,
81-
core::CStateRestoreTraverser& traverser) const override;
80+
TDataGathererPtr makeDataGatherer(const std::string& partitionFieldValue,
81+
core::CStateRestoreTraverser& traverser) const override;
8282
//@}
8383

8484
//! \name Defaults

include/model/CMetricBucketGatherer.h

+8-31
Original file line numberDiff line numberDiff line change
@@ -53,33 +53,13 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
5353
//! Create a new population metric data gatherer.
5454
//!
5555
//! \param[in] dataGatherer The owning data gatherer.
56-
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
57-
//! then this is the name of the field holding the summary count.
58-
//! \param[in] personFieldName The name of the field which identifies
59-
//! people.
60-
//! \param[in] attributeFieldName The name of the field which defines
61-
//! the person attributes.
62-
//! \param[in] valueFieldName The name of the field which contains
63-
//! the metric values.
64-
//! \param[in] influenceFieldNames The field names for which we will
65-
//! compute influences.
66-
//! \param[in] startTime The start of the time interval for which
67-
//! to gather data.
68-
CMetricBucketGatherer(CDataGatherer& dataGatherer,
69-
const std::string& summaryCountFieldName,
70-
const std::string& personFieldName,
71-
const std::string& attributeFieldName,
72-
const std::string& valueFieldName,
73-
const TStrVec& influenceFieldNames,
74-
core_t::TTime startTime);
56+
//! \param[in] initData The parameter initialization object for the bucket
57+
//! gatherer.
58+
CMetricBucketGatherer(CDataGatherer& dataGatherer, const SBucketGathererInitData& initData);
7559

7660
//! Construct from a state document.
7761
CMetricBucketGatherer(CDataGatherer& dataGatherer,
78-
const std::string& summaryCountFieldName,
79-
const std::string& personFieldName,
80-
const std::string& attributeFieldName,
81-
const std::string& valueFieldName,
82-
const TStrVec& influenceFieldNames,
62+
const SBucketGathererInitData& initData,
8363
core::CStateRestoreTraverser& traverser);
8464

8565
//! Create a copy that will result in the same persisted state as the
@@ -266,9 +246,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
266246
//! 1) initializeFieldNamesPart1()
267247
//! 2) restore state
268248
//! 3) initializeFieldNamesPart2()
269-
void initializeFieldNamesPart1(const std::string& personFieldName,
270-
const std::string& attributeFieldName,
271-
const TStrVec& influenceFieldNames);
249+
void initializeFieldNamesPart1(const SBucketGathererInitData& initData);
272250

273251
//! Initialize the field names collection.
274252
//! initializeFieldNamesPart1() must be called before this.
@@ -277,8 +255,7 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
277255
//! 1) initializeFieldNamesPart1()
278256
//! 2) restore state
279257
//! 3) initializeFieldNamesPart2()
280-
void initializeFieldNamesPart2(const std::string& valueFieldName,
281-
const std::string& summaryCountFieldName);
258+
void initializeFieldNamesPart2(const SBucketGathererInitData& initData);
282259

283260
//! Initialize the feature data gatherers.
284261
void initializeFeatureData();
@@ -307,10 +284,10 @@ class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer {
307284
TStrVec m_FieldNames;
308285

309286
//! The position of the first influencing field.
310-
std::size_t m_BeginInfluencingFields;
287+
std::size_t m_BeginInfluencingFields{0};
311288

312289
//! The position of the first count/value field.
313-
std::size_t m_BeginValueFields;
290+
std::size_t m_BeginValueFields{0};
314291

315292
//! For summarized values, this stores the metric categories
316293
//! corresponding to the summarized field names in m_FieldNames;

0 commit comments

Comments
 (0)