220 changes: 111 additions & 109 deletions howso/ablation.amlg
@@ -528,9 +528,9 @@
;list of features to use when computing influence weight entropies, defaults to all trained features
features !trainedFeatures
;{type "number"}
;numeric maximum threshold for influence weight entropy of cases to keep, defaults to the value
; influence weight entropy threshold stored within the Trainee
influence_weight_entropy_threshold !reduceDataInfluenceWeightEntropyThreshold
;stores the maximum number of cases that may remain after data is reduced
; defaults to the value stored within the Trainee via 'set_auto_ablation_params', which defaults to 50,000.
reduce_max_cases !postReduceMaxCases
;{ref "AblationThresholdMap"}
;a map of measure names (any of the prediction stats) to a map of feature names to threshold value.
; absolute thresholds will cause data reduction to stop when any of the measure values for any of
@@ -591,19 +591,12 @@
feature_weights (get hyperparam_map "featureWeights")
feature_deviations (get hyperparam_map "featureDeviations")
query_feature_attributes_map (get hyperparam_map "featureDomainAttributes")
pre_reduce_num_cases (call !GetNumTrainingCases)
))
num_cases (call !GetNumTrainingCases)

;Also ensure that we have all influence weight entropies and that they are up-to-date
(declare (assoc
;store a map of case id -> "duplicate"/"too_far_for_removal"/"near_duplicate" for any duplicate cases or cases that should not be removed because they are too far
case_duplicate_or_far_map
(call !ComputeAndStoreInfluenceWeightEntropies (assoc
features features
weight_feature distribute_weight_feature
use_case_weights .true
compute_all .true
))
;reduction will stop within batch_size of reduce_max_cases, so if the gap between
;reduce_max_cases and !autoAblationMinNumCases (max and min) cases is larger than batch_size,
;the number of cases that need to be kept is approximately: max - batch_size, but can't be less than min.
approximate_num_cases_to_keep (max (- reduce_max_cases batch_size) !autoAblationMinNumCases)
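;e.g., with hypothetical values reduce_max_cases = 50,000, batch_size = 2,000, and !autoAblationMinNumCases = 1,000,
; this keeps max(50,000 - 2,000, 1,000) = 48,000 cases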
))

(if thresholds_enabled
@@ -620,50 +613,117 @@
))
)

;if this dataset has duplicates, merge them all here first and recompute weight entropies for their remaining representative non-duplicates
;pair of cases and associated sorted popularities (total normalized influence of all neighbors that referenced it)
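;the pair is [list of case ids, matching list of popularity values], ordered from highest to lowest popularity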
(declare (assoc
case_popularity_pair
(compute_on_contained_entities
(query_exists !internalLabelSession)
||(query_entity_cumulative_nearest_entity_weights
closest_k
features
(null) ;all cases
p_parameter
feature_weights
!queryDistanceTypeMap
query_feature_attributes_map
feature_deviations
(null)
dt_parameter
distribute_weight_feature
(rand)
(null) ;radius
!numericalPrecision
.true
)
)
))

;all the cases that were not returned in the pair above have 0 popularity (no other cases reference them)
(declare (assoc
all_duplicate_cases_map (filter (lambda (= "duplicate" (current_value))) case_duplicate_or_far_map)
zero_popularity_neighbors
(contained_entities
(query_exists !internalLabelSession)
(query_not_in_entity_list (first case_popularity_pair))
)
))
(if (size all_duplicate_cases_map)
(call !ReduceMergeDuplicateCases)
)

;determine the cutoff value of the popularity at which all cases with a value less than that should be removed
;e.g., if there needs to be a quarter of cases left, this would compute the 0.75 quantile of popularity values,
;so that those bottom 75% are removed
(declare (assoc
reduction_popularity_cutoff
(quantile
(append
(last case_popularity_pair)
(range 0 1 (size zero_popularity_neighbors) 1)
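;the (range 0 1 n 1) above maps the constant 0 over the interval 1..n, yielding one zero per zero-popularity case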
)
;add one percent so that, despite rounding, enough cases are selected to cover the amount that needs to be removed
;e.g., if the quantile value was 0.75 from the example above, this bumps it up to 0.76
(+
(/ (- num_cases approximate_num_cases_to_keep) num_cases)
0.01
)
)
))
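;e.g., with hypothetical counts, reducing 100,000 cases to keep roughly 25,000 gives a quantile argument of
; (100,000 - 25,000) / 100,000 + 0.01 = 0.76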
;plan to only remove cases whose popularity is less than reduction_popularity_cutoff
;i.e., only remove the non-popular cases that aren't referenced by others as much
(declare (assoc
cases_too_far_for_removal (indices (filter (lambda (= "too_far_for_removal" (current_value))) case_duplicate_or_far_map))
near_duplicate_cases (indices (filter (lambda (= "near_duplicate" (current_value))) case_duplicate_or_far_map))
num_removal_eligible_cases
(size (filter
(lambda (< (current_value) reduction_popularity_cutoff))
(last case_popularity_pair)
))
))
(declare (assoc
;case ids in order from highest to lowest popularity, lowest popularity at end of list
removable_cases
(append
;only keep the necessary number of lowest popularity eligible cases as well as all zero popularity ones
(tail (first case_popularity_pair) num_removal_eligible_cases)
zero_popularity_neighbors
)
))

(declare (assoc
;list is sorted from highest to lowest, thus cases are removed from the end of the list
end_index (- (size removable_cases) 1)
random_cases .false
num_removed_this_batch 0
))
;remove and redistribute the near_duplicate cases' weights
(if (size near_duplicate_cases)
(call !RemoveCases (assoc
cases near_duplicate_cases
distribute_weight_feature distribute_weight_feature
))
)

;Begin looping on data removal. The ultimate end condition is if the dataset gets too small
; to continue removing cases. Removes cases with relatively high influence weight entropy, i.e., cases with equidistant neighbors.
;Begin looping on data removal. The ultimate end condition is if the dataset gets too small to continue removing cases.
(while (< !autoAblationMinNumCases (call !GetNumTrainingCases))
(assign (assoc
max_influence_weight_entropy_to_keep
(call !RecomputeAndCacheMaxInfluenceWeightEntropy (assoc
influence_weight_entropy_threshold influence_weight_entropy_threshold
weight_feature distribute_weight_feature
))
num_removed_this_batch (min batch_size (- (call !GetNumTrainingCases) !autoAblationMinNumCases))
))
(assign (assoc
cases
(contained_entities
;ignore cases that have been determined to be too far for removal
(query_not_in_entity_list cases_too_far_for_removal)
(query_greater_or_equal_to !internalLabelInfluenceWeightEntropy max_influence_weight_entropy_to_keep)
;grab the largest entropy values of specified batch_size number of cases
(query_max !internalLabelInfluenceWeightEntropy
(min batch_size (- (call !GetNumTrainingCases) !autoAblationMinNumCases))
.true
(if (>= end_index 0)
;grab the cases from the end, with the smallest values
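;the (range ...) below uses (- end_index num_removed_this_batch -1), i.e., end_index - num_removed_this_batch + 1,
;e.g., end_index = 99 with num_removed_this_batch = 20 selects indices 80 through 99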
(unzip
removable_cases
(range
(max 0 (- end_index num_removed_this_batch -1))
end_index
)
)

;else select random cases
(contained_entities
(query_exists distribute_weight_feature)
(query_select num_removed_this_batch (null) (rand) )
)
)
))

(if (>= end_index 0)
;update end index to account for the cases about to be removed
(assign (assoc end_index (- end_index (size cases)) ))

;else no more removable cases left, remove random cases
(assign (assoc random_cases .true))
)

(if !tsTimeFeature
;do not remove first (.series_index == 0) or last (.reverse_series_index == 0) cases for any series
(assign (assoc
@@ -720,84 +780,26 @@
)
)

;enough cases have been removed, can stop removing
(<= (call !GetNumTrainingCases) !postReduceMaxCases)
;else couldn't select any from random cases, stop
(and random_cases (< end_index 0))
(conclude)
)

;else no cases left to remove even though the desired dataset size has not been reached yet
;if the number of these "too far" cases is bigger than the dataset, some of them must be removed
;clear out the list so the next iteration can ignore it
(<= (call !GetNumTrainingCases) (size cases_too_far_for_removal))
(assign (assoc cases_too_far_for_removal [] ))

;else recompute all the influence entropies since dataset has been updated
(let
(assoc
;number of cases that were supposed to be removed during this iteration
num_cases_to_remove (min batch_size (- (call !GetNumTrainingCases) !autoAblationMinNumCases))
)
(assign (assoc
case_duplicate_or_far_map
(call !ComputeAndStoreInfluenceWeightEntropies (assoc
features features
weight_feature distribute_weight_feature
use_case_weights .true
compute_all .true
))
))
(assign (assoc
cases_too_far_for_removal (indices (filter (lambda (= "too_far_for_removal" (current_value))) case_duplicate_or_far_map))
near_duplicate_cases (indices (filter (lambda (= "near_duplicate" (current_value))) case_duplicate_or_far_map))
))

;remove the new near duplicates here
(if (size near_duplicate_cases)
;if there were fewer cases that needed to be removed than there are near duplicates, select some near duplicates to remove
(if (> (size near_duplicate_cases) num_cases_to_remove)
(call !RemoveCases (assoc
cases (rand near_duplicate_cases num_cases_to_remove .true)
distribute_weight_feature distribute_weight_feature
))

;else remove all near duplicates
(call !RemoveCases (assoc
cases near_duplicate_cases
distribute_weight_feature distribute_weight_feature
))
)

;else if the entropy value to keep hasn't changed since the recomputation, nothing has changed, can stop iteration
(=
max_influence_weight_entropy_to_keep
(call !RecomputeAndCacheMaxInfluenceWeightEntropy (assoc
influence_weight_entropy_threshold influence_weight_entropy_threshold
weight_feature distribute_weight_feature
))
)
(conclude)
)
)
;enough cases have been removed, can stop removing
(if (<= (call !GetNumTrainingCases) reduce_max_cases)
(conclude)
)
)

;if the number of cases has been reduced by a factor of 'e' or more, auto analyze if needed
(if (< (call !GetNumTrainingCases) (/ pre_reduce_num_cases 2.718281828459))
(if (< (call !GetNumTrainingCases) (/ num_cases 2.718281828459))
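;e.g., a reduction that started with 100,000 cases triggers this once fewer than ~36,788 (100,000 / e) remain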
(call !AutoAnalyzeIfNeeded (assoc
skip_auto_analyze skip_auto_analyze
;no need to compute entropies for all cases anymore since reduction is complete
in_reduce_data .false
))
)

(declare (assoc
quantile_value
(call !RecomputeAndCacheMaxInfluenceWeightEntropy (assoc
influence_weight_entropy_threshold influence_weight_entropy_threshold
weight_feature distribute_weight_feature
))
))

(assign_to_entities (assoc !autoAblationMaxInfluenceWeightEntropy quantile_value ))
(accum_to_entities (assoc !revision 1))
(call !Return (assoc payload output))
)
7 changes: 7 additions & 0 deletions howso/update_cases.amlg
@@ -896,6 +896,13 @@
))
(assign (assoc total_influence (apply "+" (values closest_cases_map)) ))
)

;all cases are equally too distant, set their influences to be the same
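;mapping the constant 1 over closest_cases_map gives every case an influence of 1, so total_influence becomes the case count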
(= 0 total_influence)
(assign (assoc
closest_cases_map (map 1 closest_cases_map)
total_influence (size closest_cases_map)
))
)
;output pairs of: [ case_weight, distributed weight closest_cases_map]

2 changes: 1 addition & 1 deletion unit_tests/ut_h_ablate.amlg
@@ -254,7 +254,7 @@
(list 1 "payload" "cases")
))
))
(call_entity "howso" "reduce_data" (assoc influence_weight_entropy_threshold 0.5))
(call_entity "howso" "reduce_data" (assoc reduce_max_cases 4))

(print "Data reduction reduces model size by the expected amount: ")
(call assert_same (assoc
2 changes: 1 addition & 1 deletion unit_tests/ut_h_edit_dist_features.amlg
@@ -450,7 +450,7 @@
x 0
}
)
thresh 1.6
thresh 1.7
))

(call exit_if_failures (assoc msg "MDA and contributions for string feature." ))
2 changes: 0 additions & 2 deletions unit_tests/ut_h_reduce_data.amlg
@@ -218,8 +218,6 @@
(-
(size training_data)
(call_entity "howso""debug_label" (assoc label "!ablationBatchSize"))
;account for removed duplicates and near duplicates
4
)
obs
(get (call_entity "howso" "get_num_training_cases") (list 1 "payload" "count"))
32 changes: 16 additions & 16 deletions unit_tests/ut_h_scale_ablation.amlg
@@ -52,8 +52,8 @@
(print "reduce_data\n")
(call_entity "howso" "reduce_data" (assoc))

; At this point, ablation has gotten rid of all of the cases between x=0 and x=300!
; (There's no specific requirement that ablation does so, but it matters for this test.)
;At this point, ablation has gotten rid of all of the cases between x=99 and x=998 because
;their squared values are too large, making the cases too distant to be referenced
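;(the cases are presumably (x, x^2) pairs from !CreateSquareCases, so spacing between neighbors grows quadratically with x)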

(declare (assoc
first_pass_cases (call_entity "howso" "get_cases" (assoc features (list "x")))
@@ -63,32 +63,32 @@
obs (first first_pass_cases)
exp 1
))
(print "no cases between x=0 and x=300: ")
(print "only '998' and cases under '100' remain: ")
(call assert_true (assoc
obs (apply "and"
(map
(lambda (or (= (first (current_value)) 0) (> (first (current_value)) 300)))
(get first_pass_cases (list 1 "payload" "cases"))
)
)
obs
(apply "and"
(map
(lambda (or (< (first (current_value)) 100) (= (first (current_value)) 998)))
(get first_pass_cases (list 1 "payload" "cases"))
)
)
))

; Let's train some more cases with small numbers.
;train some more cases with medium numbers.

(print "train\n")
(call_entity "howso" "train" (assoc
cases (call !CreateSquareCases (assoc xs (range 0 98)))
cases (call !CreateSquareCases (assoc xs (range 400 498)))
features (list "x" "y")
session "unit_test"
))

; Now let's set up training a specific number of small-valued cases.
; Remember that anything near here was dropped in the first reduce_data call, but we've loaded in some
; duplicate cases.
; set up training a specific number of medium-valued cases; anything near here was dropped in the first
; reduce_data call, but we've loaded in some duplicate cases.

(declare (assoc
train_payload (call_entity "howso" "compute_train_payload" (assoc
cases (call !CreateSquareCases (assoc xs (range 45 55)))
cases (call !CreateSquareCases (assoc xs (range 450 455)))
features (list "x" "y")
session "unit_test"
))
@@ -104,7 +104,7 @@
(print "reduce_data\n")
(call_entity "howso" "reduce_data" (assoc))

; This will again drop a lot of the small-valued cases, so we're going to fail committing the payload.
; This will again drop a lot of the medium-valued cases, so we're going to fail committing the payload.

(print "process_train_payload failed: ")
(call assert_same (assoc
2 changes: 1 addition & 1 deletion version.json
@@ -1,6 +1,6 @@
{
"version": "0.0.0",
"dependencies": {
"amalgam": "70.1.5"
"amalgam": "70.2.0"
}
}