Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions core/index/index_meta.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ class format;
class IndexWriter;

struct SegmentInfo {
SegmentInfo() = default;

// Convenience constructor (added for tests): initializes only `name` and
// `byte_size` from the arguments; no other member is set in the
// initializer list.
SegmentInfo(const std::string& _name, uint64_t _byte_size)
  : name(_name), byte_size(_byte_size) {}

bool operator==(const SegmentInfo&) const = default;

std::string name; // FIXME(gnusi): move to SegmentMeta
Expand Down
222 changes: 37 additions & 185 deletions core/utils/index_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,149 +28,25 @@

#include "formats/format_utils.hpp"

namespace {

// Returns the fraction of documents in `segment` that are still live,
// i.e. live_docs_count / docs_count evaluated in double precision.
// NOTE(review): docs_count == 0 would make this a division by zero;
// presumably callers never pass an empty segment - confirm.
inline double FillFactor(const irs::SegmentInfo& segment) noexcept {
  const auto live = static_cast<double>(segment.live_docs_count);
  const auto total = static_cast<double>(segment.docs_count);
  return live / total;
}

// Estimates the number of bytes `segment` would occupy with no removed
// documents: byte_size scaled by the segment's fill factor, then
// converted (truncated) to size_t.
inline size_t SizeWithoutRemovals(const irs::SegmentInfo& segment) noexcept {
  const auto total_bytes = static_cast<double>(segment.byte_size);
  return static_cast<size_t>(total_bytes * FillFactor(segment));
}

namespace tier {

// Per-segment statistics used to order segments for consolidation: keeps
// the reader, its metadata, the estimated size without removals and the
// fill factor.
// NOTE(review): in this view the body of operator< is interleaved with the
// text of a separate function (getSegmentDimensions) and its comment. This
// looks like added and removed diff lines merged together - confirm against
// the real file before relying on the exact layout below.
struct SegmentStats {
// Builds the statistics from a reader; the conversion is implicit
// (presumably on purpose, given the suppression below - confirm).
// cppcheck-suppress noExplicitConstructor
SegmentStats(const irs::SubReader& reader) noexcept
: reader{&reader},
meta{&reader.Meta()},
size{SizeWithoutRemovals(*meta)},
fill_factor{FillFactor(*meta)} {}

// Ordering: ascending by estimated size; for equal sizes the entry with the
// higher fill factor sorts first, and the segment name breaks remaining ties.
bool operator<(const SegmentStats& rhs) const noexcept {
// cppcheck-suppress constVariable
auto& lhs = *this;

if (lhs.size == rhs.size) {
if (lhs.fill_factor > rhs.fill_factor) {
return true;
} else if (lhs.fill_factor < rhs.fill_factor) {
return false;
}

return lhs.meta->name < rhs.meta->name;
// interface to fetch the required attributes from
// SegmentStats struct.
// We use this function in struct ConsolidationCandidate
// to fetch the segment dimensions from the SegmentStats
// struct.
//
// Copies byte_size, docs_count and live_docs_count from the segment's
// metadata into `attrs`.
void getSegmentDimensions(
const tier::SegmentStats& segment,
tier::SegmentAttributes& attrs) {

auto* meta = segment.meta;
attrs.byteSize = meta->byte_size;
attrs.docsCount = meta->docs_count;
attrs.liveDocsCount = meta->live_docs_count;
}

return lhs.size < rhs.size;
}

// Lets a SegmentStats be used where a `const irs::SubReader*` is expected.
operator const irs::SubReader*() const noexcept { return reader; }

// Declaration order matters: `size` and `fill_factor` are initialized from
// `*meta`, so `meta` has to be declared (and thus initialized) before them.
const irs::SubReader* reader; // reader this entry was built from
const irs::SegmentInfo* meta; // address of reader.Meta()
size_t size; // approximate size of segment without removals
double_t fill_factor; // FillFactor(*meta): live docs / total docs
};

struct ConsolidationCandidate {
using iterator_t = std::vector<SegmentStats>::const_iterator;
using range_t = std::pair<iterator_t, iterator_t>;

explicit ConsolidationCandidate(iterator_t i) noexcept : segments(i, i) {}

iterator_t begin() const noexcept { return segments.first; }
iterator_t end() const noexcept { return segments.second; }

range_t segments;
size_t count{0};
size_t size{0}; // estimated size of the level
double_t score{DBL_MIN}; // how good this permutation is
};

/// @returns score of the consolidation bucket
double_t consolidation_score(const ConsolidationCandidate& consolidation,
const size_t segments_per_tier,
const size_t floor_segment_bytes) noexcept {
// to detect how skewed the consolidation we do the following:
// 1. evaluate coefficient of variation, less is better
// 2. good candidates are in range [0;1]
// 3. favor condidates where number of segments is equal to
// 'segments_per_tier' approx
// 4. prefer smaller consolidations
// 5. prefer consolidations which clean removals

switch (consolidation.count) {
case 0:
// empty consolidation makes not sense
return DBL_MIN;
case 1: {
auto& meta = *consolidation.segments.first->meta;

if (meta.docs_count == meta.live_docs_count) {
// singletone without removals makes no sense
return DBL_MIN;
}

// FIXME honor number of deletes???
// signletone with removals makes sense if nothing better is found
return DBL_MIN + DBL_EPSILON;
}
}

size_t size_before_consolidation = 0;
size_t size_after_consolidation = 0;
size_t size_after_consolidation_floored = 0;
for (auto& segment_stat : consolidation) {
size_before_consolidation += segment_stat.meta->byte_size;
size_after_consolidation += segment_stat.size;
size_after_consolidation_floored +=
std::max(segment_stat.size, floor_segment_bytes);
}

// evaluate coefficient of variation
double sum_square_differences = 0;
const auto segment_size_after_consolidaton_mean =
static_cast<double>(size_after_consolidation_floored) /
static_cast<double>(consolidation.count);
for (auto& segment_stat : consolidation) {
const auto diff =
static_cast<double>(std::max(segment_stat.size, floor_segment_bytes)) -
segment_size_after_consolidaton_mean;
sum_square_differences += diff * diff;
}

const auto stdev = std::sqrt(sum_square_differences /
static_cast<double>(consolidation.count));
const auto cv = (stdev / segment_size_after_consolidaton_mean);

// evaluate initial score
auto score = 1. - cv;

// favor consolidations that contain approximately the requested number of
// segments
score *= std::pow(static_cast<double>(consolidation.count) /
static_cast<double>(segments_per_tier),
1.5);

// FIXME use relative measure, e.g. cosolidation_size/total_size
// carefully prefer smaller consolidations over the bigger ones
score /= std::pow(size_after_consolidation, 0.5);

// favor consolidations which clean out removals
score /= std::pow(static_cast<double>(size_after_consolidation) /
static_cast<double>(size_before_consolidation),
2);

return score;
}

} // namespace tier
} // namespace

namespace irs::index_utils {

ConsolidationPolicy MakePolicy(const ConsolidateBytes& options) {
Expand Down Expand Up @@ -391,6 +267,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
/// if
/// - segment size is greater than 'max_segments_bytes / 2'
/// - segment has many documents but only few deletions
///
/// TODO - too_big_segments_threshold formula is unreasonable
/// - add unit tests as well
///////////////////////////////////////////////////////////////////////////

const double_t total_fill_factor =
Expand All @@ -413,63 +292,36 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
}
}

///////////////////////////////////////////////////////////////////////////
/// Stage 3
/// sort candidates
///////////////////////////////////////////////////////////////////////////

std::sort(sorted_segments.begin(), sorted_segments.end());
// No point in attempting consolidation if we don't have
// enough segments to fill the consolidation window
if (sorted_segments.size() < tier::ConsolidationConfig::candidate_size)
return;

///////////////////////////////////////////////////////////////////////////
/// Stage 4
/// find proper candidates
/// Stage 3
/// Find cleanup candidates
///////////////////////////////////////////////////////////////////////////

tier::ConsolidationCandidate best(sorted_segments.begin());

if (sorted_segments.size() >= min_segments_per_tier) {
for (auto i = sorted_segments.begin(), end = sorted_segments.end();
i != end; ++i) {
tier::ConsolidationCandidate candidate(i);

while (candidate.segments.second != end &&
candidate.count < max_segments_per_tier) {
candidate.size += candidate.segments.second->size;

if (candidate.size > max_segments_bytes) {
// overcome the limit
break;
}

++candidate.count;
++candidate.segments.second;

if (candidate.count < min_segments_per_tier) {
// not enough segments yet
continue;
}

candidate.score = tier::consolidation_score(
candidate, max_segments_per_tier, floor_segment_bytes);

if (candidate.score < min_score) {
// score is too small
continue;
}

if (best.score < candidate.score) {
best = candidate;
}
}
}
tier::ConsolidationCandidate<tier::SegmentStats> best;
auto ret = tier::findBestCleanupCandidate<tier::SegmentStats>(sorted_segments, tier::getSegmentDimensions, best);
if (ret && best.initialized && std::distance(best.first(), best.last()) >= 0) {
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
return;
}

///////////////////////////////////////////////////////////////////////////
/// Stage 4
/// pick the best candidate
/// find consolidation candidates
///////////////////////////////////////////////////////////////////////////

std::copy(best.begin(), best.end(), std::back_inserter(candidates));
if (!tier::findBestConsolidationCandidate<tier::SegmentStats>(
sorted_segments,
max_segments_bytes,
tier::getSegmentDimensions, best))
return;

candidates.reserve(std::distance(best.first(), best.last()) + 1);
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
};
}

Expand Down
Loading