Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/Amalgam/IntegerSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class SortedIntegerSet
}

//returns the number of elements that exist in the set
__forceinline size_t size()
__forceinline size_t size() const
{
return integers.size();
}
Expand Down Expand Up @@ -626,7 +626,7 @@ class BitArrayIntegerSet
}

//returns the number of elements that exist in the set
constexpr size_t size()
constexpr size_t size() const
{
return numElements;
}
Expand Down Expand Up @@ -1372,7 +1372,7 @@ class EfficientIntegerSet
}

//returns the number of elements that exist
__forceinline size_t size()
__forceinline size_t size() const
{
if(isSisContainer)
return sisContainer.size();
Expand Down
103 changes: 42 additions & 61 deletions src/Amalgam/SBFDSColumnData.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

//SBFDSColumnData class maintains a sorted linear and random access data collection
//values with the same key are placed into the same bucket. buckets are stored in sorted order by key
//this is "indexing" as in database terminology, but otherwise "index" herein means a row identifier
class SBFDSColumnData
{
public:
Expand Down Expand Up @@ -101,14 +102,14 @@ class SBFDSColumnData
size_entry->second = std::make_unique<SortedIntegerSet>();

//add the entity
size_entry->second->insert(index);
size_entry->second->InsertNewLargestInteger(index);

UpdateLargestCode(code_size, index);
}
}

//inserts indices assuming that they have been sorted by value,
// and that index_values are also sorted from smallest to largest
// and that index_values are also sorted from smallest to largest within each value
void AppendSortedNumberIndicesWithSortedIndices(std::vector<DistanceReferencePair<size_t>> &index_values)
{
if(index_values.size() == 0)
Expand All @@ -117,8 +118,11 @@ class SBFDSColumnData
//count unique values so only need to perform one allocation for the main list
size_t num_uniques = 1;
double prev_value = index_values[0].distance;
size_t maxIndex = 0;
for(size_t i = 1; i < index_values.size(); i++)
{
if(auto reference = index_values[i].reference; maxIndex < reference)
maxIndex = reference;
if(prev_value != index_values[i].distance)
{
num_uniques++;
Expand All @@ -127,7 +131,7 @@ class SBFDSColumnData
}

sortedNumberValueEntries.reserve(num_uniques);
numberIndices.ReserveNumIntegers(index_values.back().reference + 1);
numberIndices.ReserveNumIntegers(maxIndex + 1);

for(auto &index_value : index_values)
{
Expand Down Expand Up @@ -308,6 +312,7 @@ class SBFDSColumnData
std::make_unique<ValueEntry>(new_number_value));

InsertFirstIndexIntoNumberValueEntry(index, new_value_entry_index);
new_value_index = sortedNumberValueEntries[new_value_entry_index]->valueInternIndex;
}

if(internedNumberValues.valueInterningEnabled)
Expand Down Expand Up @@ -371,11 +376,16 @@ class SBFDSColumnData
}
}
}
else if(inserted) //shouldn't make it here, but ensure integrity just in case
else //shouldn't make it here, but ensure integrity just in case
{
assert(false);
new_id_entry->second = std::make_unique<ValueEntry>(new_sid_value);
InsertFirstIndexIntoStringIdValueEntry(index, new_id_entry);
if(inserted) {
new_id_entry->second = std::make_unique<ValueEntry>(new_sid_value);
InsertFirstIndexIntoStringIdValueEntry(index, new_id_entry);
}
else
new_id_entry->second->indicesWithValue.insert(index);

new_value_index = new_id_entry->second->valueInternIndex;
}

Expand Down Expand Up @@ -430,15 +440,16 @@ class SBFDSColumnData
valueCodeSizeToIndices.erase(old_size_entry);
}
}
else if(inserted) //shouldn't make it here, but ensure integrity just in case
else //shouldn't make it here, but ensure integrity just in case
{
assert(false);
new_size_entry->second = std::make_unique<SortedIntegerSet>();
if(inserted) {
new_size_entry->second = std::make_unique<SortedIntegerSet>();
}
new_size_entry->second->insert(index);
}
}

//update longest string as appropriate
//see if need to update largest code
if(index == indexWithLargestCode)
RecomputeLargestCode();
Expand Down Expand Up @@ -513,14 +524,15 @@ class SBFDSColumnData
assert(false);

auto &entities = id_entry->second->indicesWithValue;
entities.erase(index);

//if no more entries have the value, remove it
if(entities.size() == 0)
if(entities.size() <= 1)
{
internedStringIdValues.DeleteInternIndex(id_entry->second->valueInternIndex);
stringIdValueEntries.erase(id_entry);
}
else
entities.erase(index);

//see if need to compute new longest string
if(index == indexWithLongestString)
Expand Down Expand Up @@ -555,10 +567,11 @@ class SBFDSColumnData

//remove the entity
auto &entities = *(id_entry->second);
entities.erase(index);

if(entities.size() == 0)
if(entities.size() <= 1)
valueCodeSizeToIndices.erase(id_entry);
else
entities.erase(index);

//see if need to update largest code
if(index == indexWithLargestCode)
Expand All @@ -576,6 +589,7 @@ class SBFDSColumnData
{
ValueEntry *value_entry = sortedNumberValueEntries[value_index].get();

assert(value_entry->indicesWithValue.size() == 0);
value_entry->indicesWithValue.insert(index);
internedNumberValues.InsertValueEntry(value_entry, sortedNumberValueEntries.size());
}
Expand All @@ -587,6 +601,7 @@ class SBFDSColumnData
{
ValueEntry *value_entry = value_iter->second.get();

assert(value_entry->indicesWithValue.size() == 0);
value_entry->indicesWithValue.insert(index);
internedStringIdValues.InsertValueEntry(value_entry, stringIdValueEntries.size());
}
Expand Down Expand Up @@ -656,10 +671,12 @@ class SBFDSColumnData

//try to insert the value if not already there
auto [inserted_id_entry, inserted] = stringIdValueEntries.emplace(string_id, nullptr);
if(inserted)
if(inserted) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Style is for newline here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, there's logic in InsertFirstIndexIntoStringIdValueEntry to do the right thing, which mimics the logic above (line 658) for numbers.

inserted_id_entry->second = std::make_unique<ValueEntry>(string_id);

InsertFirstIndexIntoStringIdValueEntry(index, inserted_id_entry);
InsertFirstIndexIntoStringIdValueEntry(index, inserted_id_entry);
}
else
inserted_id_entry->second->indicesWithValue.insert(index);

UpdateLongestString(string_id, index);

Expand Down Expand Up @@ -701,42 +718,6 @@ class SBFDSColumnData
return numberIndices.size() + stringIdIndices.size() + codeIndices.size();
}

//returns the maximum difference between a value and any other value for this column, based on feature_attribs.featureType:
// nominal types: 1 - 1/n, where n is the total count of number, string id, and code indices;
//  returns 0 when n is 0 rather than dividing by zero and yielding negative infinity
// continuous numeric: the largest number value minus the smallest, or 0 if there is at most one number value
// continuous numeric cyclic: half of maxCyclicDifference
// continuous string / code: twice the longest string length / largest code size
// any other feature type: infinity
inline double GetMaxDifferenceTerm(GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs)
{
	switch(feature_attribs.featureType)
	{
	case GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC:
	case GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING:
	case GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE:
	{
		const auto num_indices = numberIndices.size() + stringIdIndices.size() + codeIndices.size();

		//an empty column has no values to differ from, so the maximum difference is 0
		if(num_indices == 0)
			return 0.0;

		return 1.0 - 1.0 / static_cast<double>(num_indices);
	}

	case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC:
		//with zero or one distinct number value there is nothing to differ from
		if(sortedNumberValueEntries.size() <= 1)
			return 0.0;

		//entries are kept sorted by value, so the extremes are at the ends
		return sortedNumberValueEntries.back()->value.number - sortedNumberValueEntries[0]->value.number;

	case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC:
		//maximum is the other side of the cycle
		return feature_attribs.typeAttributes.maxCyclicDifference / 2;

	case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING:
		//the max difference is the worst case edit distance, of removing all the characters
		// and then adding back in another of equal size but different
		return static_cast<double>(longestStringLength * 2);

	case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE:
		//the max difference is the worst case edit distance, of removing all the characters
		// and then adding back in another of equal size but different
		return static_cast<double>(largestCodeSize * 2);

	default:
		return std::numeric_limits<double>::infinity();
	}
}

//returns the exact index of value
//Same as std::binary_search but returns both index and if found
// .first: found index - if not found, returns closest index from lower_bound if
Expand Down Expand Up @@ -840,8 +821,10 @@ class SBFDSColumnData
}
}

//given a feature_id and a range [low, high], inserts all the elements with values of feature feature_id within specified range into out; does not clear out
//if the feature value is null, it will NOT be present in the search results, ie "x" != 3 will NOT include elements with x is null, even though null != 3
//given a range [low, high], inserts all the elements within specified range into out; does not clear out
//the range is inclusive of high for numbers, but exclusive for string ids!
//either but not both numerical bounds may be NaN to signify an infinity
//if between_values is false, find complementary set
void FindAllIndicesWithinRange(EvaluableNodeImmediateValueType value_type,
EvaluableNodeImmediateValue &low, EvaluableNodeImmediateValue &high, BitArrayIntegerSet &out, bool between_values = true)
{
Expand All @@ -864,7 +847,7 @@ class SBFDSColumnData
//modify range to include elements from or up to -/+inf
if(FastIsNaN(low_number)) //find all NaN values and all values up to max
low_number = -std::numeric_limits<double>::infinity(); //else include elements from -inf to high as well as NaN elements
else
else if(FastIsNaN(high_number))
high_number = std::numeric_limits<double>::infinity(); //include elements from low to +inf as well as NaN elements
}

Expand All @@ -884,6 +867,7 @@ class SBFDSColumnData
//if within range, and range has no length, just return indices in that one bucket
if(between_values)
{
assert(exact_index_found);
size_t index = value_index;
out.InsertInBatch(sortedNumberValueEntries[index]->indicesWithValue);
}
Expand All @@ -909,10 +893,6 @@ class SBFDSColumnData
//insert everything between the two indices
for(size_t i = start_index; i < end_index; i++)
out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue);

//include end_index if value matches
if(end_index < sortedNumberValueEntries.size() && sortedNumberValueEntries[end_index]->value.number == high_number)
out.InsertInBatch(sortedNumberValueEntries[end_index]->indicesWithValue);
}
else //not between_values
{
Expand All @@ -928,6 +908,7 @@ class SBFDSColumnData
}
else if(value_type == ENIVT_STRING_ID)
{
//there are no ids for this column, so return no results
if(stringIdValueEntries.size() == 0)
return;

Expand Down Expand Up @@ -998,7 +979,7 @@ class SBFDSColumnData
if(sortedNumberValueEntries.size() == 0)
return;

//search left to right for max (bucket 0 is largest) or right to left for min
//search right to left for max (bucket 0 is smallest) or left to right for min
int64_t value_index = find_max ? sortedNumberValueEntries.size() - 1 : 0;

while(value_index < static_cast<int64_t>(sortedNumberValueEntries.size()) && value_index >= 0)
Expand All @@ -1016,7 +997,7 @@ class SBFDSColumnData
return;
}

value_index += find_max ? -1 : 1; //search right to right for max or left to right for min
value_index += find_max ? -1 : 1; //search right to left for max or left to right for min
}
}
else if(value_type == ENIVT_STRING_ID)
Expand Down
44 changes: 42 additions & 2 deletions src/Amalgam/SeparableBoxFilterDataStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class SeparableBoxFilterDataStore
inline double GetMaxDistanceTermForContinuousFeature(RepeatedGeneralizedDistanceEvaluator &r_dist_eval,
size_t query_feature_index, size_t absolute_feature_index, bool high_accuracy)
{
double max_diff = columnData[absolute_feature_index]->GetMaxDifferenceTerm(
double max_diff = GetMaxDifferenceTerm(*columnData[absolute_feature_index],
r_dist_eval.distEvaluator->featureAttribs[query_feature_index]);
return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonNullRegular(
max_diff, query_feature_index, high_accuracy);
Expand Down Expand Up @@ -933,6 +933,46 @@ class SeparableBoxFilterDataStore
return std::make_pair(true, distance);
}

//returns the maximum difference between value and any other value for this column
//column is the column data to examine and feature_attribs selects the feature type, which determines the result:
// nominal types: 1 - 1/n, where n is the total count of number, string id, and code indices, or 0 if n is 0
// continuous numeric: the last sorted number value minus the first, or 0 if there is at most one number value
// continuous numeric cyclic: half of maxCyclicDifference
// continuous string / code: twice the longest string length / largest code size
// any other feature type trips the assert and returns infinity
static inline double GetMaxDifferenceTerm(const SBFDSColumnData &column,
GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs)
{
switch(feature_attribs.featureType)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why should this be static rather than a method of column data?

{
case GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC:
case GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING:
case GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE: {
auto denom = (column.numberIndices.size() + column.stringIdIndices.size() + column.codeIndices.size());
return denom > 0 ? 1.0 - 1.0 / denom : 0;
}

case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC:
if(column.sortedNumberValueEntries.size() <= 1)
return 0.0;

return column.sortedNumberValueEntries.back()->value.number - column.sortedNumberValueEntries[0]->value.number;

case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC:
//maximum is the other side of the cycle
return feature_attribs.typeAttributes.maxCyclicDifference / 2;

case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING:
//the max difference is the worst case edit distance, of removing all the characters
// and then adding back in another of equal size but different
return static_cast<double>(column.longestStringLength * 2);

case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE:
//the max difference is the worst case edit distance, of removing all the characters
// and then adding back in another of equal size but different
return static_cast<double>(column.largestCodeSize * 2);

default:
//this switch should be exhaustive
assert(false);
return std::numeric_limits<double>::infinity();
}
}

public:

//populates specified target value given the selected target values for each value in corresponding position* parameters
Expand Down Expand Up @@ -976,7 +1016,7 @@ class SeparableBoxFilterDataStore
if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation)
|| FastIsNaN(feature_attribs.unknownToUnknownDistanceTerm.deviation))
{
unknown_distance_deviation = column_data->GetMaxDifferenceTerm(feature_attribs);
unknown_distance_deviation = GetMaxDifferenceTerm(*column_data, feature_attribs);

if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation))
feature_attribs.knownToUnknownDistanceTerm.deviation = unknown_distance_deviation;
Expand Down