-
Notifications
You must be signed in to change notification settings - Fork 2
Sbfds column data changes #251
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
Paul-Licameli
wants to merge
14
commits into
howsoai:main
Choose a base branch
from
Paul-Licameli:SBFDSColumnData-changes
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
801d5a9
Move GetMaxDifferenceTerm to SeparableBoxFilterDataStore...
Paul-Licameli 548f31d
Some const qualifiers
Paul-Licameli c8e4c2a
Should GetMaxDifferenceTerm return negative infinity? I assumed not
Paul-Licameli 0e22063
Don't intern string id twice in a column
Paul-Licameli e20c9e8
Reviewed uses of valueInterningEnabled
Paul-Licameli 9194259
Use InsertNewLargestInteger
Paul-Licameli 81f4e0d
Improve some comments
Paul-Licameli 0ff1f46
Minor efficiencies
Paul-Licameli a119ef3
Correct defensive code path
Paul-Licameli eec8536
Better reservation of integer array
Paul-Licameli 6a8d750
More assertions
Paul-Licameli a110b52
Fix comments and logic in FindAllIndicesWithinRange
Paul-Licameli 4d5b6fc
Merge branch 'main' into SBFDSColumnData-changes
howsohazard 7dc5625
Merge branch 'main' into SBFDSColumnData-changes
howsohazard File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,7 +77,7 @@ class SeparableBoxFilterDataStore | |
| inline double GetMaxDistanceTermForContinuousFeature(RepeatedGeneralizedDistanceEvaluator &r_dist_eval, | ||
| size_t query_feature_index, size_t absolute_feature_index, bool high_accuracy) | ||
| { | ||
| double max_diff = columnData[absolute_feature_index]->GetMaxDifferenceTerm( | ||
| double max_diff = GetMaxDifferenceTerm(*columnData[absolute_feature_index], | ||
| r_dist_eval.distEvaluator->featureAttribs[query_feature_index]); | ||
| return r_dist_eval.distEvaluator->ComputeDistanceTermContinuousNonNullRegular( | ||
| max_diff, query_feature_index, high_accuracy); | ||
|
|
@@ -933,6 +933,46 @@ class SeparableBoxFilterDataStore | |
| return std::make_pair(true, distance); | ||
| } | ||
|
|
||
| //returns the maximum difference between value and any other value for this column | ||
| static inline double GetMaxDifferenceTerm(const SBFDSColumnData &column, | ||
| GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs) | ||
| { | ||
| switch(feature_attribs.featureType) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why should this be static rather than a method of column data? |
||
| { | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_NUMERIC: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_STRING: | ||
| case GeneralizedDistanceEvaluator::FDT_NOMINAL_CODE: { | ||
| auto denom = (column.numberIndices.size() + column.stringIdIndices.size() + column.codeIndices.size()); | ||
| return denom > 0 ? 1.0 - 1.0 / denom : 0; | ||
| } | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC: | ||
| if(column.sortedNumberValueEntries.size() <= 1) | ||
| return 0.0; | ||
|
|
||
| return column.sortedNumberValueEntries.back()->value.number - column.sortedNumberValueEntries[0]->value.number; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_NUMERIC_CYCLIC: | ||
| //maximum is the other side of the cycle | ||
| return feature_attribs.typeAttributes.maxCyclicDifference / 2; | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_STRING: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(column.longestStringLength * 2); | ||
|
|
||
| case GeneralizedDistanceEvaluator::FDT_CONTINUOUS_CODE: | ||
| //the max difference is the worst case edit distance, of removing all the characters | ||
| // and then adding back in another of equal size but different | ||
| return static_cast<double>(column.largestCodeSize * 2); | ||
|
|
||
| default: | ||
| //this switch should be exhaustive | ||
| assert(false); | ||
| return std::numeric_limits<double>::infinity(); | ||
| } | ||
| } | ||
|
|
||
| public: | ||
|
|
||
| //populates specified target value given the selected target values for each value in corresponding position* parameters | ||
|
|
@@ -976,7 +1016,7 @@ class SeparableBoxFilterDataStore | |
| if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation) | ||
| || FastIsNaN(feature_attribs.unknownToUnknownDistanceTerm.deviation)) | ||
| { | ||
| unknown_distance_deviation = column_data->GetMaxDifferenceTerm(feature_attribs); | ||
| unknown_distance_deviation = GetMaxDifferenceTerm(*column_data, feature_attribs); | ||
|
|
||
| if(FastIsNaN(feature_attribs.knownToUnknownDistanceTerm.deviation)) | ||
| feature_attribs.knownToUnknownDistanceTerm.deviation = unknown_distance_deviation; | ||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Style is for newline here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, there's logic in InsertFirstIndexIntoStringIdValueEntry to do the right thing, which mimics the logic above (line 658) for numbers.