Skip to content

Commit e1e93b5

Browse files
edsavage and valeriy42 authored
[ML] Report the "actual" memory usage of the autodetect process (#2846)
Determine the actual memory usage of the autodetect process as reported by the OS, e.g. on Linux this would be the value of the maximum resident set size returned by a call to `getrusage`. Add this value to the model size stats record returned to the ES Java process so it can be included in the `job counts` tab for anomaly detection jobs. --------- Co-authored-by: Valeriy Khakhutskyy <[email protected]>
1 parent 8cffe88 commit e1e93b5

16 files changed

+160
-12
lines changed

bin/autodetect/Main.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <core/CJsonOutputStreamWrapper.h>
2525
#include <core/CLogger.h>
2626
#include <core/CProcessPriority.h>
27+
#include <core/CProcessStats.h>
2728
#include <core/CProgramCounters.h>
2829
#include <core/CStringUtils.h>
2930
#include <core/CoreTypes.h>
@@ -83,7 +84,9 @@ int main(int argc, char** argv) {
8384
ml::counter_t::E_TSADNumberMemoryLimitModelCreationFailures,
8485
ml::counter_t::E_TSADNumberPrunedItems,
8586
ml::counter_t::E_TSADAssignmentMemoryBasis,
86-
ml::counter_t::E_TSADOutputMemoryAllocatorUsage};
87+
ml::counter_t::E_TSADOutputMemoryAllocatorUsage,
88+
ml::counter_t::E_TSADSystemMemoryUsage,
89+
ml::counter_t::E_TSADMaxSystemMemoryUsage};
8790

8891
ml::core::CProgramCounters::registerProgramCounterTypes(counters);
8992

@@ -151,6 +154,8 @@ int main(int argc, char** argv) {
151154
}
152155
cancellerThread.stop();
153156

157+
LOG_DEBUG(<< "Max Resident Set Size: " << ml::core::CProcessStats::maxResidentSetSize());
158+
LOG_DEBUG(<< "Resident Set Size: " << ml::core::CProcessStats::residentSetSize());
154159
// Log the program version immediately after reconfiguring the logger. This
155160
// must be done from the program, and NOT a shared library, as each program
156161
// statically links its own version library.

docs/CHANGELOG.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828

2929
//=== Regressions
3030

31+
== {es} version 9.2.0
32+
33+
=== Enhancements
34+
35+
* Report the actual memory usage of the autodetect process. (See {ml-pull}2846[#2846])
36+
3137
== {es} version 9.1.0
3238

3339
=== Enhancements

include/core/CProgramCounters.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,12 @@ enum ECounterTypes {
112112
//! The memory currently used by the allocators to output JSON documents, in bytes.
113113
E_TSADOutputMemoryAllocatorUsage = 30,
114114

115+
//! The resident set size of the process, in bytes.
116+
E_TSADSystemMemoryUsage = 31,
117+
118+
//! The maximum resident set size of the process, in bytes.
119+
E_TSADMaxSystemMemoryUsage = 32,
120+
115121
// Data Frame Outlier Detection
116122

117123
//! The estimated peak memory usage for outlier detection in bytes
@@ -146,7 +152,7 @@ enum ECounterTypes {
146152
// Add any new values here
147153

148154
//! This MUST be last, increment the value for every new enum added
149-
E_LastEnumCounter = 31
155+
E_LastEnumCounter = 33
150156
};
151157

152158
static constexpr std::size_t NUM_COUNTERS = static_cast<std::size_t>(E_LastEnumCounter);
@@ -355,6 +361,10 @@ class CORE_EXPORT CProgramCounters {
355361
"Which option is being used to get model memory for node assignment?"},
356362
{counter_t::E_TSADOutputMemoryAllocatorUsage, "E_TSADOutputMemoryAllocatorUsage",
357363
"The amount of memory used to output JSON documents, in bytes."},
364+
{counter_t::E_TSADSystemMemoryUsage, "E_TSADSystemMemoryUsage",
365+
"The amount of system memory used by the process, in bytes"},
366+
{counter_t::E_TSADMaxSystemMemoryUsage, "E_TSADMaxSystemMemoryUsage",
367+
"The maximum amount of system memory used by the process, in bytes"},
358368
{counter_t::E_DFOEstimatedPeakMemoryUsage, "E_DFOEstimatedPeakMemoryUsage",
359369
"The upfront estimate of the peak memory outlier detection would use"},
360370
{counter_t::E_DFOPeakMemoryUsage, "E_DFOPeakMemoryUsage", "The peak memory outlier detection used"},

include/model/CProcessMemoryUsage.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the following additional limitation. Functionality enabled by the
5+
* files subject to the Elastic License 2.0 may only be used in production when
6+
* invoked by an Elasticsearch process with a license key installed that permits
7+
* use of machine learning features. You may not use this file except in
8+
* compliance with the Elastic License 2.0 and the foregoing additional
9+
* limitation.
10+
*/
11+
12+
#ifndef INCLUDED_ml_model_CSystemMemoryUsage_h
13+
#define INCLUDED_ml_model_CSystemMemoryUsage_h
14+
15+
#include <model/ImportExport.h>
16+
17+
#include <cstddef>
18+
19+
namespace ml {
20+
namespace model {
21+
22+
//! \brief Determines how to calculate the memory used by the current process.
23+
//!
24+
//! DESCRIPTION:\n
25+
//! Determines how to calculate the memory used by the current process based on the operating system.
26+
//! On some OS's (Mac, Windows) we use the estimated memory usage of the models,
27+
//! while on others (Linux) we use the actual memory of the process as provided by system calls.
28+
class MODEL_EXPORT CProcessMemoryUsage {
29+
public:
30+
enum class EMemoryStrategy { E_Estimated, E_System };
31+
32+
static const EMemoryStrategy MEMORY_STRATEGY;
33+
34+
public:
35+
CProcessMemoryUsage() = delete;
36+
};
37+
}
38+
}
39+
40+
#endif //INCLUDED_ml_model_CSystemMemoryUsage_h

include/model/CResourceMonitor.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ class MODEL_EXPORT CResourceMonitor {
180180
//! Returns the sum of used memory plus any extra memory
181181
std::size_t totalMemory() const;
182182

183+
//! Returns the current physical memory of the process (rss) as reported by the system
184+
static std::size_t systemMemory();
185+
186+
//! Returns the maximum physical memory of the process (max rss) as reported by the system
187+
static std::size_t maxSystemMemory();
188+
183189
private:
184190
using TMonitoredResourcePtrSizeUMap =
185191
boost::unordered_map<CMonitoredResource*, std::size_t>;
@@ -229,6 +235,9 @@ class MODEL_EXPORT CResourceMonitor {
229235
//! Returns the amount by which reported memory usage is scaled depending on the type of persistence in use
230236
std::size_t persistenceMemoryIncreaseFactor() const;
231237

238+
//! Modify the supplied usage value depending on a platform dependent strategy.
239+
std::size_t applyMemoryStrategy(std::size_t usage) const;
240+
232241
private:
233242
//! The registered collection of components
234243
TMonitoredResourcePtrSizeUMap m_Resources;

lib/api/CAnomalyJob.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,10 @@ bool CAnomalyJob::handleRecord(const TStrStrUMap& dataRowFields, TOptionalTime t
205205
}
206206

207207
++core::CProgramCounters::counter(counter_t::E_TSADNumberApiRecordsHandled);
208+
core::CProgramCounters::counter(counter_t::E_TSADSystemMemoryUsage) =
209+
model::CResourceMonitor::systemMemory();
210+
core::CProgramCounters::counter(counter_t::E_TSADMaxSystemMemoryUsage) =
211+
model::CResourceMonitor::maxSystemMemory();
208212

209213
++m_NumRecordsHandled;
210214
m_LatestRecordTime = std::max(m_LatestRecordTime, *time);

lib/api/CModelSizeStatsJsonWriter.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ const std::string JOB_ID{"job_id"};
2525
const std::string MODEL_SIZE_STATS{"model_size_stats"};
2626
const std::string MODEL_BYTES{"model_bytes"};
2727
const std::string PEAK_MODEL_BYTES{"peak_model_bytes"};
28+
const std::string SYSTEM_MEMORY_BYTES{"system_memory_bytes"};
29+
const std::string MAX_SYSTEM_MEMORY_BYTES{"max_system_memory_bytes"};
2830
const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"};
2931
const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"};
3032
const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"};

lib/api/unittest/CAnomalyJobLimitTest.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* limitation.
1010
*/
1111
#include <core/CJsonOutputStreamWrapper.h>
12+
#include <core/CProcessStats.h>
1213
#include <core/CoreTypes.h>
1314

1415
#include <maths/common/CIntegerTools.h>
@@ -105,8 +106,6 @@ BOOST_AUTO_TEST_CASE(testAccuracy) {
105106
core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);
106107

107108
model::CLimits limits;
108-
//limits.resourceMonitor().m_ByteLimitHigh = 100000;
109-
//limits.resourceMonitor().m_ByteLimitLow = 90000;
110109

111110
{
112111
LOG_TRACE(<< "Setting up job");
@@ -129,6 +128,7 @@ BOOST_AUTO_TEST_CASE(testAccuracy) {
129128
nonLimitedUsage = limits.resourceMonitor().totalMemory();
130129
}
131130
}
131+
LOG_DEBUG(<< "nonLimitedUsage: " << nonLimitedUsage);
132132
{
133133
// Now run the data with limiting
134134
ml::api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig(

lib/api/unittest/CJsonOutputWriterTest.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,7 +1733,7 @@ BOOST_AUTO_TEST_CASE(testReportMemoryUsage) {
17331733
resourceUsage.s_OverFields = 7;
17341734
resourceUsage.s_AllocationFailures = 8;
17351735
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
1736-
resourceUsage.s_AssignmentMemoryBasis = ml::model_t::E_AssignmentBasisCurrentModelBytes;
1736+
resourceUsage.s_AssignmentMemoryBasis = ml::model_t::E_AssignmentBasisPeakModelBytes;
17371737
resourceUsage.s_BucketStartTime = 9;
17381738
resourceUsage.s_BytesExceeded = 10;
17391739
resourceUsage.s_BytesMemoryLimit = 11;
@@ -1785,7 +1785,7 @@ BOOST_AUTO_TEST_CASE(testReportMemoryUsage) {
17851785
BOOST_TEST_REQUIRE(sizeStats.contains("memory_status"));
17861786
BOOST_REQUIRE_EQUAL("hard_limit", sizeStats.at("memory_status").as_string());
17871787
BOOST_TEST_REQUIRE(sizeStats.contains("assignment_memory_basis"));
1788-
BOOST_REQUIRE_EQUAL("current_model_bytes",
1788+
BOOST_REQUIRE_EQUAL("peak_model_bytes",
17891789
sizeStats.at("assignment_memory_basis").as_string());
17901790
BOOST_TEST_REQUIRE(sizeStats.contains("log_time"));
17911791
std::int64_t nowMs{ml::core::CTimeUtils::nowMs()};

lib/core/CProcessStats_MacOSX.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88
* compliance with the Elastic License 2.0 and the foregoing additional
99
* limitation.
1010
*/
11-
#include <core/CLogger.h>
1211
#include <core/CProcessStats.h>
1312

13+
#include <core/CLogger.h>
14+
1415
#include <errno.h>
1516
#include <fcntl.h>
1617
#include <sys/resource.h>

0 commit comments

Comments
 (0)