-
Notifications
You must be signed in to change notification settings - Fork 2
Sbfds column data changes #251
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 13 commits
801d5a9
548f31d
c8e4c2a
0e22063
e20c9e8
9194259
81f4e0d
0ff1f46
a119ef3
eec8536
6a8d750
a110b52
4d5b6fc
7dc5625
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
|
|
||
| //SBFDSColumnData class maintains a sorted linear and random access data collection | ||
| //values with the same key are placed into the same bucket. buckets are stored in sorted order by key | ||
| //this is "indexing" as in database terminology, but otherwise "index" herein means a row identifier | ||
| class SBFDSColumnData | ||
| { | ||
| public: | ||
|
|
@@ -101,14 +102,14 @@ class SBFDSColumnData | |
| size_entry->second = std::make_unique<SortedIntegerSet>(); | ||
|
|
||
| //add the entity | ||
| size_entry->second->insert(index); | ||
| size_entry->second->InsertNewLargestInteger(index); | ||
|
|
||
| UpdateLargestCode(code_size, index); | ||
| } | ||
| } | ||
|
|
||
| //inserts indices assuming that they have been sorted by value, | ||
| // and that index_values are also sorted from smallest to largest | ||
| // and that index_values are also sorted from smallest to largest within each value | ||
| void AppendSortedNumberIndicesWithSortedIndices(std::vector<DistanceReferencePair<size_t>> &index_values) | ||
| { | ||
| if(index_values.size() == 0) | ||
|
|
@@ -117,8 +118,11 @@ class SBFDSColumnData | |
| //count unique values so only need to perform one allocation for the main list | ||
| size_t num_uniques = 1; | ||
| double prev_value = index_values[0].distance; | ||
| size_t maxIndex = 0; | ||
| for(size_t i = 1; i < index_values.size(); i++) | ||
| { | ||
| if(auto reference = index_values[i].reference; maxIndex < reference) | ||
| maxIndex = reference; | ||
| if(prev_value != index_values[i].distance) | ||
| { | ||
| num_uniques++; | ||
|
|
@@ -127,7 +131,7 @@ class SBFDSColumnData | |
| } | ||
|
|
||
| sortedNumberValueEntries.reserve(num_uniques); | ||
| numberIndices.ReserveNumIntegers(index_values.back().reference + 1); | ||
| numberIndices.ReserveNumIntegers(maxIndex + 1); | ||
|
|
||
| for(auto &index_value : index_values) | ||
| { | ||
|
|
@@ -308,6 +312,7 @@ class SBFDSColumnData | |
| std::make_unique<ValueEntry>(new_number_value)); | ||
|
|
||
| InsertFirstIndexIntoNumberValueEntry(index, new_value_entry_index); | ||
| new_value_index = sortedNumberValueEntries[new_value_entry_index]->valueInternIndex; | ||
| } | ||
|
|
||
| if(internedNumberValues.valueInterningEnabled) | ||
|
|
@@ -371,11 +376,16 @@ class SBFDSColumnData | |
| } | ||
| } | ||
| } | ||
| else if(inserted) //shouldn't make it here, but ensure integrity just in case | ||
| else //shouldn't make it here, but ensure integrity just in case | ||
| { | ||
| assert(false); | ||
| new_id_entry->second = std::make_unique<ValueEntry>(new_sid_value); | ||
| InsertFirstIndexIntoStringIdValueEntry(index, new_id_entry); | ||
| if(inserted) { | ||
| new_id_entry->second = std::make_unique<ValueEntry>(new_sid_value); | ||
| InsertFirstIndexIntoStringIdValueEntry(index, new_id_entry); | ||
| } | ||
| else | ||
| new_id_entry->second->indicesWithValue.insert(index); | ||
|
|
||
| new_value_index = new_id_entry->second->valueInternIndex; | ||
| } | ||
|
|
||
|
|
@@ -430,15 +440,16 @@ class SBFDSColumnData | |
| valueCodeSizeToIndices.erase(old_size_entry); | ||
| } | ||
| } | ||
| else if(inserted) //shouldn't make it here, but ensure integrity just in case | ||
| else //shouldn't make it here, but ensure integrity just in case | ||
| { | ||
| assert(false); | ||
| new_size_entry->second = std::make_unique<SortedIntegerSet>(); | ||
| if(inserted) { | ||
| new_size_entry->second = std::make_unique<SortedIntegerSet>(); | ||
| } | ||
| new_size_entry->second->insert(index); | ||
| } | ||
| } | ||
|
|
||
| //update longest string as appropriate | ||
| //see if need to update largest code | ||
| if(index == indexWithLargestCode) | ||
| RecomputeLargestCode(); | ||
|
|
@@ -513,14 +524,15 @@ class SBFDSColumnData | |
| assert(false); | ||
|
|
||
| auto &entities = id_entry->second->indicesWithValue; | ||
| entities.erase(index); | ||
|
|
||
| //if no more entries have the value, remove it | ||
| if(entities.size() == 0) | ||
| if(entities.size() <= 1) | ||
| { | ||
| internedStringIdValues.DeleteInternIndex(id_entry->second->valueInternIndex); | ||
| stringIdValueEntries.erase(id_entry); | ||
| } | ||
| else | ||
| entities.erase(index); | ||
|
|
||
| //see if need to compute new longest string | ||
| if(index == indexWithLongestString) | ||
|
|
@@ -555,10 +567,11 @@ class SBFDSColumnData | |
|
|
||
| //remove the entity | ||
| auto &entities = *(id_entry->second); | ||
| entities.erase(index); | ||
|
|
||
| if(entities.size() == 0) | ||
| if(entities.size() <= 1) | ||
| valueCodeSizeToIndices.erase(id_entry); | ||
| else | ||
| entities.erase(index); | ||
|
|
||
| //see if need to update largest code | ||
| if(index == indexWithLargestCode) | ||
|
|
@@ -576,6 +589,7 @@ class SBFDSColumnData | |
| { | ||
| ValueEntry *value_entry = sortedNumberValueEntries[value_index].get(); | ||
|
|
||
| assert(value_entry->indicesWithValue.size() == 0); | ||
| value_entry->indicesWithValue.insert(index); | ||
| internedNumberValues.InsertValueEntry(value_entry, sortedNumberValueEntries.size()); | ||
| } | ||
|
|
@@ -587,6 +601,7 @@ class SBFDSColumnData | |
| { | ||
| ValueEntry *value_entry = value_iter->second.get(); | ||
|
|
||
| assert(value_entry->indicesWithValue.size() == 0); | ||
| value_entry->indicesWithValue.insert(index); | ||
| internedStringIdValues.InsertValueEntry(value_entry, stringIdValueEntries.size()); | ||
| } | ||
|
|
@@ -601,7 +616,7 @@ class SBFDSColumnData | |
| { | ||
| invalidIndices.insert(index); | ||
|
|
||
| if(internedNumberValues.valueInterningEnabled) | ||
| if(internedNumberValues.valueInterningEnabled || internedStringIdValues.valueInterningEnabled) | ||
| return EvaluableNodeImmediateValue(ValueEntry::NULL_INDEX); | ||
| else | ||
| return value; | ||
|
|
@@ -656,10 +671,12 @@ class SBFDSColumnData | |
|
|
||
| //try to insert the value if not already there | ||
| auto [inserted_id_entry, inserted] = stringIdValueEntries.emplace(string_id, nullptr); | ||
| if(inserted) | ||
| if(inserted) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Style is for newline here
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, there's logic in InsertFirstIndexIntoStringIdValueEntry to do the right thing, which mimics the logic above (line 658) for numbers. |
||
| inserted_id_entry->second = std::make_unique<ValueEntry>(string_id); | ||
|
|
||
| InsertFirstIndexIntoStringIdValueEntry(index, inserted_id_entry); | ||
| InsertFirstIndexIntoStringIdValueEntry(index, inserted_id_entry); | ||
| } | ||
| else | ||
| inserted_id_entry->second->indicesWithValue.insert(index); | ||
|
|
||
| UpdateLongestString(string_id, index); | ||
|
|
||
|
|
@@ -701,42 +718,6 @@ class SBFDSColumnData | |
| return numberIndices.size() + stringIdIndices.size() + codeIndices.size(); | ||
| } | ||
|
|
||
| //returns the maximum difference between value and any other value for this column | ||
| //if empty, will return infinity | ||
| inline double GetMaxDifferenceTerm(GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs) | ||
| { | ||
| switch(feature_attribs.featureType) | ||
| { | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE: | ||
| return 1.0 - 1.0 / (numberIndices.size() + stringIdIndices.size() + codeIndices.size()); | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC: | ||
| if(sortedNumberValueEntries.size() <= 1) | ||
| return 0.0; | ||
|
|
||
| return sortedNumberValueEntries.back()->value.number - sortedNumberValueEntries[0]->value.number; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC: | ||
| //maximum is the other side of the cycle | ||
| return feature_attribs.typeAttributes.maxCyclicDifference / 2; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(longestStringLength * 2); | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(largestCodeSize * 2); | ||
|
|
||
| default: | ||
| return std::numeric_limits<double>::infinity(); | ||
| } | ||
| } | ||
|
|
||
| //returns the exact index of value | ||
| //Same as std::binary_search but returns both index and if found | ||
| // .first: found index - if not found, returns closest index from lower_bound if | ||
|
|
@@ -840,8 +821,10 @@ class SBFDSColumnData | |
| } | ||
| } | ||
|
|
||
| //given a feature_id and a range [low, high], inserts all the elements with values of feature feature_id within specified range into out; does not clear out | ||
| //if the feature value is null, it will NOT be present in the search results, ie "x" != 3 will NOT include elements with x is null, even though null != 3 | ||
| //given a range [low, high], inserts all the elements within specified range into out; does not clear out | ||
| //the range is inclusive of high for numbers, but exclusive for string ids! | ||
| //either but not both numerical bounds may be NaN to signify an infinity | ||
| //if between_values is false, find complementary set | ||
| void FindAllIndicesWithinRange(EvaluableNodeImmediateValueType value_type, | ||
| EvaluableNodeImmediateValue &low, EvaluableNodeImmediateValue &high, BitArrayIntegerSet &out, bool between_values = true) | ||
| { | ||
|
|
@@ -864,7 +847,7 @@ class SBFDSColumnData | |
| //modify range to include elements from or up to -/+inf | ||
| if(FastIsNaN(low_number)) //find all NaN values and all values up to max | ||
| low_number = -std::numeric_limits<double>::infinity(); //else include elements from -inf to high as well as NaN elements | ||
| else | ||
| else if(FastIsNaN(high_number)) | ||
| high_number = std::numeric_limits<double>::infinity(); //include elements from low to +inf as well as NaN elements | ||
| } | ||
|
|
||
|
|
@@ -884,6 +867,7 @@ class SBFDSColumnData | |
| //if within range, and range has no length, just return indices in that one bucket | ||
| if(between_values) | ||
| { | ||
| assert(exact_index_found); | ||
| size_t index = value_index; | ||
| out.InsertInBatch(sortedNumberValueEntries[index]->indicesWithValue); | ||
| } | ||
|
|
@@ -909,10 +893,6 @@ class SBFDSColumnData | |
| //insert everything between the two indices | ||
| for(size_t i = start_index; i < end_index; i++) | ||
| out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue); | ||
|
|
||
| //include end_index if value matches | ||
| if(end_index < sortedNumberValueEntries.size() && sortedNumberValueEntries[end_index]->value.number == high_number) | ||
| out.InsertInBatch(sortedNumberValueEntries[end_index]->indicesWithValue); | ||
| } | ||
| else //not between_values | ||
| { | ||
|
|
@@ -928,6 +908,7 @@ class SBFDSColumnData | |
| } | ||
| else if(value_type == ENIVT_STRING_ID) | ||
| { | ||
| //there are no ids for this column, so return no results | ||
| if(stringIdValueEntries.size() == 0) | ||
| return; | ||
|
|
||
|
|
@@ -998,7 +979,7 @@ class SBFDSColumnData | |
| if(sortedNumberValueEntries.size() == 0) | ||
| return; | ||
|
|
||
| //search left to right for max (bucket 0 is largest) or right to left for min | ||
| //search right to left for max (bucket 0 is smallest) or left to right for min | ||
| int64_t value_index = find_max ? sortedNumberValueEntries.size() - 1 : 0; | ||
|
|
||
| while(value_index < static_cast<int64_t>(sortedNumberValueEntries.size()) && value_index >= 0) | ||
|
|
@@ -1016,7 +997,7 @@ class SBFDSColumnData | |
| return; | ||
| } | ||
|
|
||
| value_index += find_max ? -1 : 1; //search right to right for max or left to right for min | ||
| value_index += find_max ? -1 : 1; //search right to left for max or left to right for min | ||
| } | ||
| } | ||
| else if(value_type == ENIVT_STRING_ID) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,7 +77,7 @@ class SeparableBoxFilterDataStore | |
| inline double GetMaxDistanceTermForContinuousFeature(RepeatedGeneralizedDistanceEvaluator &r_dist_eval, | ||
| size_t query_feature_index, size_t absolute_feature_index, bool high_accuracy) | ||
| { | ||
| double max_diff = columnData[absolute_feature_index]->GetMaxDifferenceTerm( | ||
| double max_diff = GetMaxDifferenceTerm(*columnData[absolute_feature_index], | ||
| r_dist_eval.distEvaluator->featureAttribs[query_feature_index]); | ||
| return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonNullRegular( | ||
| max_diff, query_feature_index, high_accuracy); | ||
|
|
@@ -933,6 +933,46 @@ class SeparableBoxFilterDataStore | |
| return std::make_pair(true, distance); | ||
| } | ||
|
|
||
| //returns the maximum difference between value and any other value for this column | ||
| static inline double GetMaxDifferenceTerm(const SBFDSColumnData &column, | ||
| GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs) | ||
| { | ||
| switch(feature_attribs.featureType) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why should this be static rather than a method of column data? |
||
| { | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE: { | ||
| auto denom = (column.numberIndices.size() + column.stringIdIndices.size() + column.codeIndices.size()); | ||
| return denom > 0 ? 1.0 - 1.0 / denom : 0; | ||
| } | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC: | ||
| if(column.sortedNumberValueEntries.size() <= 1) | ||
| return 0.0; | ||
|
|
||
| return column.sortedNumberValueEntries.back()->value.number - column.sortedNumberValueEntries[0]->value.number; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC: | ||
| //maximum is the other side of the cycle | ||
| return feature_attribs.typeAttributes.maxCyclicDifference / 2; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(column.longestStringLength * 2); | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(column.largestCodeSize * 2); | ||
|
|
||
| default: | ||
| //this switch should be exhaustive | ||
| assert(false); | ||
| return std::numeric_limits<double>::infinity(); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
|
|
||
| //populates specified target value given the selected target values for each value in corresponding position* parameters | ||
|
|
@@ -976,7 +1016,7 @@ class SeparableBoxFilterDataStore | |
| if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation) | ||
| || FastIsNaN(feature_attribs.unknownToUnknownDistanceTerm.deviation)) | ||
| { | ||
| unknown_distance_deviation = column_data->GetMaxDifferenceTerm(feature_attribs); | ||
| unknown_distance_deviation = GetMaxDifferenceTerm(*column_data, feature_attribs); | ||
|
|
||
| if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation)) | ||
| feature_attribs.knownToUnknownDistanceTerm.deviation = unknown_distance_deviation; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good find!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This one is important enough that I'm going to include it in another PR going in shortly to make the next release.