diff --git a/howso.amlg b/howso.amlg index 4567a5c23..ea61823c8 100644 --- a/howso.amlg +++ b/howso.amlg @@ -149,6 +149,10 @@ #!postProcessMap (null) #!hasDependentFeatures (null) #!dependentFeatureMap (null) + #!sharedDeviationsMap (null) + #!sharedDeviationGroupByPrimaryMap (null) + #!sharedDeviationsNonPrimaryFeatures (null) + #!sharedDeviationsPrimaryFeatures (null) #!continuousToNominalDependenciesMap (null) #!dependentsBoundaryMap (null) #!dependentValuesCombinationsMap (null) @@ -229,6 +233,7 @@ "residuals" "return_typing" "series_store" + "shared_deviations" "substitution" "synthesis" "synthesis_bounds" @@ -556,6 +561,20 @@ ;assoc of feature -> { 'dependent_features' : [ list of dependent features ] } !dependentFeatureMap (assoc) + ;assoc of feature -> primary feature key from group of shared deviations features + ; This primary feature key is the first sorted feature in the group of features + !sharedDeviationsMap (assoc) + + ;list of all primary shared features + !sharedDeviationsPrimaryFeatures (list) + + ;list of all non-primary shared features in a group + !sharedDeviationsNonPrimaryFeatures (list) + + ;assoc of primary group feature -> list of shared features in its group + ; e.g., { "a": ["a", "b"], "c": ["c", "d"] } + !sharedDeviationGroupByPrimaryMap (assoc) + ;assoc of continuous feature -> [ list of sorted nominal dependents ] !continuousToNominalDependenciesMap (assoc) diff --git a/howso/analysis.amlg b/howso/analysis.amlg index bf6a3a927..56fb43b15 100644 --- a/howso/analysis.amlg +++ b/howso/analysis.amlg @@ -293,6 +293,7 @@ weight_feature weight_feature use_case_weights use_case_weights compute_null_uncertainties (false) + use_shared_deviations (true) )) ) diff --git a/howso/analysis_weights.amlg b/howso/analysis_weights.amlg index ebfa38e07..a3c55748b 100644 --- a/howso/analysis_weights.amlg +++ b/howso/analysis_weights.amlg @@ -29,7 +29,7 @@ (while (< iteration num_iterations) (assign (assoc residuals_map - (call 
!ExpandResidualValuesToUncertainty (assoc + (call !ExpandResidualValuesToUncertainty (assoc feature_residuals_map (call !CalculateFeatureResiduals (assoc features features @@ -51,6 +51,9 @@ compute_all_statistics use_deviations ;don't sparsify the confusion matrix so that SDM can be computed using full counts confusion_matrix_min_count 0 + use_shared_deviations (true) + ;don't create copies of confusion matrices for non-primary shared deviation features + expand_confusion_matrices (false) )) )) )) @@ -215,7 +218,7 @@ ) ;updates analyzed_hp_map - ;run multiple iterations of grid search and resdiuals to either use with the inverse_residuals_as_weights flow + ;run multiple iterations of grid search and residuals to either use with the inverse_residuals_as_weights flow #!ConvergeIRW (declare (assoc @@ -350,6 +353,9 @@ compute_all_statistics use_deviations ;don't sparsify the confusion matrix so that SDM can be computed using full counts confusion_matrix_min_count 0 + use_shared_deviations (true) + ;don't create copies of confusion matrices for non-primary shared deviation features + expand_confusion_matrices (false) )) )) ) @@ -436,7 +442,7 @@ (assoc feature (current_index 1)) ;store class counts as weights (reciprocal of count) for all cases for each id feature (map - (lambda (/ 1 (current_value))) + (lambda (/ 1 (current_value)) ) ;grab the un-weighted count of each class (compute_on_contained_entities (list diff --git a/howso/attribute_maps.amlg b/howso/attribute_maps.amlg index d0e7e2a1f..28338edf0 100644 --- a/howso/attribute_maps.amlg +++ b/howso/attribute_maps.amlg @@ -205,6 +205,207 @@ full_dependents_map ) + ;Creates shared deviations model attributes: + ; shared_deviations_key_map: mapping of shared deviations where the keys are each feature within a group of shared deviations, and the values are their + ; corresponding primary feature. For each group of shared deviations, they are stored under the first sorted feature in their group, a.k.a. 
the primary feature. + ; e.g., a group of features with shared deviations is [A, B, C], creates a mapping of {A : A, B : A, C : A} + ; shared_deviations_group_by_primary: mapping of primary key -> list of shared deviation features in the group, e.g. {A : [A, B, C]} + ; shared_deviations_primary_features: list of all primary features, e.g., [A] + ; shared_deviations_non_primary_features: list of all non-primary features e.g., [B, C] + #!ProcessSharedDeviationsMap + (seq + (assign (assoc + shared_deviations_map + (append + (map + (lambda + (or + (get shared_deviations_map (current_index)) + [(current_index 1)] + ) + ) + (zip (apply "append" (values shared_deviations_map))) + ) + shared_deviations_map + ) + )) + + ;create lists of referenced features for each shared feature + (declare (assoc + shared_sets + (map + (lambda (let + (assoc + shared_features (current_value 1) + feature (current_index 1) + ) + (append + feature + ;keep lists of shared features that reference either this feature or any of its shared_features + (indices (filter + (lambda + (or + (contains_value shared_features (current_index)) + (contains_value (current_value) feature) + (!= + (size shared_features) + (size (remove (zip shared_features) (current_value)) ) + ) + ) + ) + (remove shared_deviations_map feature) + )) + ) + )) + shared_deviations_map + ) + )) + + + (declare (assoc + combined_shared_sets + (map + (lambda (let + (assoc + shared_set (zip (current_value 1)) + num_features_in_set (size (current_value 1)) + feature (current_index 1) + ) + + ;append all the overlapping shared sets and make them unique by zipping up the appended features + (zip + (apply "append" + ;if there's any overlap between shared_set and any other feature's shared_set, combine them + (values (filter + (lambda + (if (!= num_features_in_set (size (remove shared_set (current_value)))) + (append shared_set (zip (current_value))) + ) + ) + (remove shared_sets feature) + )) + ) + ) + + )) + shared_sets + ) + + )) + + 
(declare (assoc + unique_sets (values combined_shared_sets (true)) + )) + + (declare (assoc + shared_deviations_key_map + (map + (lambda (let + (assoc feature (current_index 1)) + ;find the shared set that this feature belongs to and output the first feature from that set + (first (indices + (first (filter + (lambda (contains_index (current_value) feature)) + unique_sets + )) + )) + )) + combined_shared_sets + ) + )) + + (declare (assoc + primary_features (indices (zip (values shared_deviations_key_map))) + )) + + (assoc + "shared_deviations_key_map" shared_deviations_key_map + "shared_deviations_primary_features" primary_features + "shared_deviations_group_by_primary" + (map + (lambda (indices (get combined_shared_sets (current_index))) ) + (zip primary_features) + ) + "shared_deviations_non_primary_features" (indices (remove shared_deviations_key_map primary_features)) + ) + ) + + ;Validates the necessary parameters for shared deviations, making sure that no shared deviations features are unique nominals and that nominal features do not share deviations with non-nominal features. + #!ValidateSharedDeviations + (let + (assoc + error_messages (list) + invalid_unique_shared_deviations_features (null) + invalid_nominals_non_nominals_shared_deviations (null) + ) + ;check to see if any feature in a shared deviations group is a unique nominal + (if (size unique_nominals_set) + (assign (assoc + invalid_unique_shared_deviations_features + (filter + (lambda (contains_index unique_nominals_set (current_value))) + (indices (get shared_deviations_map "shared_deviations_key_map")) + ) + )) + ) + + (if (size invalid_unique_shared_deviations_features) + (accum (assoc + error_messages + (list (concat + "Features with shared deviations may not be unique nominals, as they have null deviations. " + "The following features are in a shared deviations group and is a unique nominal: " + (apply "concat" (trunc (weave invalid_unique_shared_deviations_features ", "))) + "." 
+ )) + )) + ) + + ;check to see if nominal features has shared deviations with non-nominal features + (if (size nominals) + (assign (assoc + invalid_nominals_non_nominals_shared_deviations + ;returns at least an empty assoc for each group of shared deviations, append only keeps the assocs with values + (apply "append" + (values (map + ;grab the type of the primary feature, and then filter out all the ones that match. Should be left + ; with a list of size 0 as all of the features should share the same type + (lambda + (let + (assoc + ;the nominal type of the primary feature in the group of shared deviations + is_primary_nominal (contains_value nominals (current_index 1)) + ) + (filter + ;if the current value's nominal type does not match the first value's return it + (lambda + (!= is_primary_nominal (contains_value nominals (current_value)) ) + ) + ;list of shared deviations in the group ["a", "b"] + (current_value) + ) + ) + ) + ;assoc of shared deviations groups, i.e. {"a": ["a", "b"], "c": ["c", "d"] } + (get shared_deviations_map "shared_deviations_group_by_primary") + )) + ) + )) + ) + + (if (size invalid_nominals_non_nominals_shared_deviations) + (accum (assoc + error_messages (list "Nominal features may not share deviations with non-nominal features.") + )) + ) + + ;warns if errors are present + (if (size error_messages) + (conclude (call !Return (assoc errors error_messages))) + ) + ) + + #!ComposeContinuousToNominalDependenciesMap (let (assoc diff --git a/howso/attributes.amlg b/howso/attributes.amlg index 88f3ed47f..caf11ae99 100644 --- a/howso/attributes.amlg +++ b/howso/attributes.amlg @@ -93,6 +93,10 @@ ; feature is non-sensitive, setting this parameter to true will bypass the 'subtype' ; requirement. Only applicable to nominal features. Default is false ; + ; 'shared_deviations': list or boolean, The shared deviations feature group. If a list, this feature will share deviations with + ; the features in this list. 
By default, derived lag values share deviations with the parent feature. If + ; false and a parent feature, derived lags of this feature will not share deviations. + ; ; 'subtype': string, the type used in novel nominal substitution. ; ; 'original_type': string, original data type details. Used by clients to determine how to serialize and deserialize feature data. @@ -212,11 +216,17 @@ continuous_to_nominal_dependents_map (null) nominal_to_nominal_dependents_map (null) novel_substition_feature_set (null) + shared_deviations_map (null) )) (call !UpdateAttributesForDateTimeDataTypes) (assign (assoc + shared_deviations_map + (filter (map + (lambda (get (current_value) "shared_deviations") ) + feature_attributes + )) ordinals_map (filter (lambda (= "ordinal" (get (current_value) "type"))) @@ -420,6 +430,13 @@ )) ) + (if (size shared_deviations_map) + (seq + (assign (assoc shared_deviations_map (call !ProcessSharedDeviationsMap))) + (call !ValidateSharedDeviations) + ) + ) + (if (size post_process_map) (assign (assoc has_post_processing (true))) ) @@ -553,6 +570,11 @@ !queryDistanceTypeMap query_distance_type_map !tsTimeFeature time_series_feature + !sharedDeviationsMap (get shared_deviations_map "shared_deviations_key_map") + !sharedDeviationGroupByPrimaryMap (get shared_deviations_map "shared_deviations_group_by_primary") + !sharedDeviationsPrimaryFeatures (get shared_deviations_map "shared_deviations_primary_features") + !sharedDeviationsNonPrimaryFeatures (get shared_deviations_map "shared_deviations_non_primary_features") + )) (call !SetOrdinalFeatures (assoc ordinal_features ordinals)) diff --git a/howso/feature_residuals.amlg b/howso/feature_residuals.amlg index 7571b3824..20f545140 100644 --- a/howso/feature_residuals.amlg +++ b/howso/feature_residuals.amlg @@ -21,6 +21,7 @@ feature_with_nulls (null) compute_null_uncertainties (true) compute_all_statistics (true) + use_shared_deviations (false) ) (if (= (null) context_features) @@ -94,6 +95,7 @@ 
compute_all_statistics compute_all_statistics confusion_matrix_min_count confusion_matrix_min_count compute_null_uncertainties compute_null_uncertainties + use_shared_deviations use_shared_deviations )) )) @@ -210,6 +212,10 @@ ; confusion_matrix_min_count: number, optional, default is 10. Applicable only to confusion matrices, the number of predictions a class should have ; (value of a cell in the matrix) for it to remain in the confusion matrix. If the count is less than this value, it will be accumulated ; into a single value of all insignificant predictions for the class and removed from the confusion matrix. + ; use_shared_deviations: flag, optional, default false. If set to true, will return the residuals grouped by shared deviations group, otherwise will return the residuals + ; calculated on a per feature basis. + ; expand_confusion_matrices: flag, optional, default true. When use_shared_deviations is true, if there are confusion matrices for shared features, they will be copied over + ; for all non-primary shared features. When false, confusion matrices will not be copied and remain only for the primary features. 
#!CalculateFeatureResiduals (declare (assoc @@ -235,6 +241,8 @@ compute_all_statistics (false) compute_null_uncertainties (true) confusion_matrix_min_count 15 + use_shared_deviations (false) + expand_confusion_matrices (true) ) (declare (assoc @@ -289,6 +297,23 @@ (call !RunFullResiduals) ) + (if use_shared_deviations + (if (size !sharedDeviationsMap) + (seq + (assign (assoc feature_residuals_lists (call !PrepSharedDeviations) )) + (assign (assoc + features (filter (lambda (not (contains_value !sharedDeviationsNonPrimaryFeatures (current_value)))) features) + )) + ) + + ;else there are no shared deviations groups defined, ignore shared deviations + (assign (assoc use_shared_deviations (false))) + ) + ) + + ;if shared deviations, the prediction_stats returned will be a fully expanded assoc of prediction stats, + ; however the features list and feature_residuals_lists will be shortened to only features that do not belong + ; to a shared deviations group, or are the primary key for a group of shared deviations. (declare (assoc prediction_stats (if compute_all_statistics @@ -319,7 +344,14 @@ ;only query features that have nulls (lambda (get (current_value) "has_nulls")) ;ignore all inactive features (where all values are nulls) - (keep (remove !featureNullRatiosMap (indices !inactiveFeaturesMap)) features) + (keep + (remove !featureNullRatiosMap (indices !inactiveFeaturesMap)) + ;if shared deviations, the features list needs to be expanded to include features that were dropped for shared deviations calculations + (if use_shared_deviations + (call !ExpandForSharedDeviations (assoc compressed_values features)) + features + ) + ) ) )) ;store an assoc of lag/rate/delta feature -> lag/order amount for time series flows @@ -392,6 +424,30 @@ )) ) + + ;for shared deviations, only one set of deviations is calculated for each group. 
At the end of the calculations, + ; the deviations mapping is expanded to add back the other features whose deviations were not calculated and point them at the feature + ; under which their shared deviations are stored. + (if use_shared_deviations + (map + (lambda (let + (assoc + current_map_name (current_index 1) + current_map_value (current_value 1) + ) + (assign (associate + current_map_name + (call !ExpandForSharedDeviations (assoc compressed_values current_map_value)) + )) + )) + (assoc + "ordinal_residuals_map" ordinal_residuals_map + "residuals_map" residuals_map + "null_uncertainties_map" null_uncertainties_map + ) + ) + ) + + (assoc "residual_map" residuals_map "ordinal_residual_map" ordinal_residuals_map @@ -1055,6 +1111,41 @@ ) )) + + ;copy over all the stats for non-primary shared features + (if use_shared_deviations + (map + (lambda (let + (assoc + current_map_name (current_index 1) + current_map_value (current_value 1) + ) + ;only expand shared deviations that are not confusion_matrix_map + ;if expand_confusion_matrices flag is true, will also expand confusion_matrix_map + (if (or + (!= "confusion_matrix_map" current_map_name) + expand_confusion_matrices + ) + (assign (associate + current_map_name + (call !ExpandForSharedDeviations (assoc compressed_values current_map_value)) + )) + ) + )) + (assoc + "accuracy_map" accuracy_map + "precision_map" precision_map + "recall_map" recall_map + "mcc_map" mcc_map + "r2_map" r2_map + "rmse_map" rmse_map + "spearman_coeff_map" spearman_coeff_map + "confusion_matrix_map" confusion_matrix_map + ) + ) + ) + + ;null out stats for unique features that don't apply (if (size unique_features_for_populating_output) (let (assoc diff --git a/howso/generate_features.amlg b/howso/generate_features.amlg index c0e0823aa..a1c5ebaca 100644 --- a/howso/generate_features.amlg +++ b/howso/generate_features.amlg @@ -698,6 +698,18 @@ (map (lambda (append (assoc + "shared_deviations" + (if + ;if manually specified shared deviations, 
use it + (size (get !featureAttributes [(get lag_sub_feature_names (current_index 0)) "shared_deviations"])) + (get !featureAttributes [(get lag_sub_feature_names (current_index 0)) "shared_deviations"]) + ;if parent feature's shared deviations value is set to false, skip + (!= (false) (get attributes "shared_deviations")) + ;only need the lags in one feature's attributes. Putting it in the first lag feature to reduce prep processing. + (if (= 1 (current_value 1)) + (append lag_sub_feature_names feature) + ) + ) "type" (if (contains_index attributes "type") (get attributes "type") "continuous") "derived_feature_code" (call !GenerateFeatureAttributeDerivation (assoc diff --git a/howso/hyperparameters.amlg b/howso/hyperparameters.amlg index 4b2ebae23..49cec7ccd 100644 --- a/howso/hyperparameters.amlg +++ b/howso/hyperparameters.amlg @@ -32,44 +32,63 @@ ) (if (size confusion_matrix_map) - (assign (assoc - feature_deviations - (map - ;convert deviations for nominals that had a confusion matrix computed from a single value into an assoc - ;containing the nominal deviation as 'expected_deviation' and a sparse deviation matrix as 'sdm' - (lambda - (if (contains_index confusion_matrix_map (current_index)) - (let - (assoc feature (current_index 1) ) - (declare (assoc - sdm - ##SparseDeviationMatrix - (call !ConfusionMatrixToSDM (assoc - confusion_matrix (get confusion_matrix_map [feature "matrix"]) - feature feature - expected_deviation (current_value 2) - )) - )) + (seq + (assign (assoc + feature_deviations + (map + ;convert deviations for nominals that had a confusion matrix computed from a single value into an assoc + ;containing the nominal deviation as 'expected_deviation' and a sparse deviation matrix as 'sdm' + (lambda + (if (contains_index confusion_matrix_map (current_index)) + (let + (assoc feature (current_index 1) ) + (declare (assoc + sdm + ##SparseDeviationMatrix + (call !ConfusionMatrixToSDM (assoc + confusion_matrix (get confusion_matrix_map [feature 
"matrix"]) + feature feature + expected_deviation (current_value 2) + )) + )) - ;output an assoc of sdm and expected deviation - (if sdm - (assoc - "sdm" sdm - "expected_deviation" (current_value 1) - ) + ;output an assoc of sdm and expected deviation + (if sdm + (assoc + "sdm" sdm + "expected_deviation" (current_value 1) + ) - ;else there is no sdm, return just the value - (current_value) + ;else there is no sdm, return just the value + (current_value) + ) ) - ) - ;else leave deviation value as-is - (current_value) + ;else leave deviation value as-is + (current_value) + ) + ) + ;if there are shared deviations, don't store or compute SDM for deviations for the non-primary shared features + (if (size !sharedDeviationsMap) + (remove feature_deviations !sharedDeviationsNonPrimaryFeatures) + feature_deviations ) ) + )) + + ;append the shared deviations as a duplicate of their primary feature shared deviation + (if (size !sharedDeviationsMap) + (accum (assoc feature_deviations - ) - )) + (map + (lambda + (get feature_deviations (get !sharedDeviationsMap (current_index))) + ) + (zip !sharedDeviationsNonPrimaryFeatures) + ) + )) + ) + ) ) (accum (assoc @@ -1091,6 +1110,11 @@ leftover_confusion_matrix (assoc) leftover_unknown_class_deviation (null) valid_weight_feature (and use_case_weights (or !hasPopulatedCaseWeight (!= weight_feature ".case_weight")) ) + feature_grouping + (if (get !sharedDeviationsMap feature) + (call !GetSharedDeviationGrouping (assoc feature_group_to_retrieve feature)) + [feature] + ) )) (declare (assoc @@ -1106,24 +1130,42 @@ (conclude (null)) ) - (declare (assoc num_classes (size (get !expectedValuesMap [weight_feature feature "class_counts"])) )) - (if (= 0 num_classes) - (assign (assoc - num_classes - (size - (compute_on_contained_entities (list - (query_value_masses - feature - (if valid_weight_feature weight_feature) - (or - (not (contains_index !nominalsMap feature)) - (contains_index !numericNominalFeaturesMap feature) + ;pull the number of 
classes for this feature from !expectedValuesMap if it was cached, otherwise compute it here + ;if feature is in a shared deviation feature_grouping, counts the total number of unique classes from all the features in the group + (declare (assoc + num_classes + (size (values + (apply "append" + (map + (lambda + (let + (assoc + feature_classes (indices (get !expectedValuesMap [weight_feature (current_value 2) "class_counts"])) + current_feature (current_value 1) + ) + (if (= 0 (size feature_classes)) + (indices + (compute_on_contained_entities (list + (query_value_masses + current_feature + (if valid_weight_feature weight_feature) + (or + (not (contains_index !nominalsMap current_feature)) + (contains_index !numericNominalFeaturesMap current_feature) + ) + ) + )) + ) + feature_classes + ) ) ) - )) + feature_grouping + ) ) - )) - ) + (true) + )) + )) ;average all probabilities for a class that are same as or more than the predicted class probability (declare (assoc diff --git a/howso/residuals.amlg b/howso/residuals.amlg index 8890966f8..6029aff9a 100644 --- a/howso/residuals.amlg +++ b/howso/residuals.amlg @@ -50,26 +50,57 @@ ;helper method for CalculateFeatureResiduals to compute and cache min gaps and min residuals #!CacheFeatureMinGapAndResidual - (let - (assoc - cached_feature_half_min_gap_map + (seq + (declare (assoc + features_to_compute + (if (size !sharedDeviationsMap) + ;remove non-primary shared features from the features list + (indices (remove (zip features) !sharedDeviationsNonPrimaryFeatures) ) + + ;else use all features + features + ) + )) + + (declare (assoc + feature_half_min_gap_map (map (lambda (let (assoc + ;if using shared deviations, the group of shared deviations features including this feature + feature_grouping + ;returns any empty list if not in a group + (call !GetSharedDeviationGrouping (assoc feature_group_to_retrieve (current_index 2))) + current_feature (current_index 1) + ) + + ;if not in a group, create a group of just the feature + 
(if (= (size feature_grouping) 0) + (assign (assoc feature_grouping [current_feature])) + ) + + (declare (assoc smallest_gap - ;gap is always 1 for nominals or strings - (if (or - (contains_index !nominalsMap (current_index 1)) - (contains_value (list "string" "string_mixable") (get !editDistanceFeatureTypesMap (current_index 1))) + (if + ;gap is always 1 for nominals or strings + (or + (contains_index !nominalsMap current_feature) + (contains_value (list "string" "string_mixable") (get !editDistanceFeatureTypesMap current_feature )) ) 1 - ;else compute the gap - (compute_on_contained_entities (list - (query_min_difference (current_index 2) (get !cyclicFeaturesMap (current_index 2)) ) - )) + (apply "min" + (map + (lambda + (compute_on_contained_entities (list + (query_min_difference (current_value 1) (get !cyclicFeaturesMap (current_value 1)) ) + )) + ) + feature_grouping + ) + ) ) - ) + )) ;infinity means there was no gap, set value to zero (if (= .infinity smallest_gap) @@ -78,7 +109,7 @@ ;(null) means that all values are nulls and a gap couldn't be computed ;set it to be 0.1 for edit distance feature, and zero for all other continuous (= (null) smallest_gap) - (if (contains_index !editDistanceFeatureTypesMap (current_index)) + (if (contains_index !editDistanceFeatureTypesMap current_feature) 0.1 0 ) @@ -87,12 +118,12 @@ (/ smallest_gap 2) ) )) - (zip features) + (zip features_to_compute) ) - ) + )) (declare (assoc - cached_feature_min_residual_map + feature_min_residual_map (map (lambda ;empty datasets set minimal residual to be half a gap @@ -114,17 +145,25 @@ (/ (* 2 (current_value)) (log (+ 1 num_training_cases)) ) ) ) - cached_feature_half_min_gap_map + feature_half_min_gap_map ) )) + (if (size !sharedDeviationsMap) + (assign (assoc + feature_min_residual_map + (call !ExpandForSharedDeviations (assoc compressed_values feature_min_residual_map)) + feature_half_min_gap_map + (call !ExpandForSharedDeviations (assoc compressed_values feature_half_min_gap_map)) + 
)) + ) + (assign_to_entities (assoc - !cachedFeatureHalfMinGapMap cached_feature_half_min_gap_map - !cachedFeatureMinResidualMap cached_feature_min_residual_map + !cachedFeatureMinResidualMap (replace feature_min_residual_map) + !cachedFeatureHalfMinGapMap (replace feature_half_min_gap_map ) )) ) - ;Limit computed residuals from feature_residuals_map to be the max of cached minimum residual, computed residual, and user-specified error ;outputs the passed in feature_residuals_map with limited values. ; @@ -135,62 +174,114 @@ (set feature_residuals_map "residual_map" - (map - (lambda - ;set upper bound to nominals to be max nominal deviation, accounting for imbalanced classes - (if (contains_index !nominalsMap (current_index)) - (let - (assoc - ;map of class -> count - class_counts_map - (compute_on_contained_entities (list - (query_value_masses (current_index 2) (null) (contains_index !numericNominalFeaturesMap (current_index 2)) ) - )) - ) - - ;do not max cap nominal deviations if less than 2 classes have been trained - (if (<= (size class_counts_map) 1) - (max - (get !cachedFeatureMinResidualMap (current_index)) - (current_value) - (get !userSpecifiedFeatureErrorsMap (current_index)) - ) - - ;else cap the max nominal deviation + (let (assoc + temp_residual_map + (map + (lambda + ;set upper bound to nominals to be max nominal deviation, accounting for imbalanced classes + (if (contains_index !nominalsMap (current_index)) (let - (assoc total_count (apply "+" (values class_counts_map)) ) + (assoc + ;map of class -> count + feature_grouping + (if (size !sharedDeviationsMap) + (call !GetSharedDeviationGrouping (assoc feature_group_to_retrieve (current_index 2))) + ) + current_feature (current_index 1) + ) + (declare (assoc + class_counts_map + (if (size feature_grouping) + ;get the total class counts for all the features in the shared deviations group + (reduce + (lambda + ;add each class's count together + (map + (lambda (+ + (or (first (current_value))) + (or (last 
(current_value))) + )) + (previous_result) + (current_value) + ) + ) + ;list of each feature's class count assocs + (map + (lambda + (compute_on_contained_entities (list + (query_value_masses (current_value 1) (null) (contains_index !numericNominalFeaturesMap (current_value 1)) ) + )) + ) + feature_grouping + ) + ) - (min - ;nominal max deviation is the sum of: each class's probability multiplied by probability of getting it wrong - (apply "+" - (map - (lambda (let - (assoc class_prob (* (/ (current_value 1) total_count)) ) - (* class_prob (- 1 class_prob)) + (compute_on_contained_entities (list + (query_value_masses current_feature (null) (contains_index !numericNominalFeaturesMap current_feature) ) )) - (values class_counts_map) ) - ) + )) + ;do not max cap nominal deviations if less than 2 classes have been trained + (if (<= (size class_counts_map) 1) (max (get !cachedFeatureMinResidualMap (current_index)) (current_value) (get !userSpecifiedFeatureErrorsMap (current_index)) ) + + ;else cap the max nominal deviation + (let + (assoc total_count (apply "+" (values class_counts_map)) ) + + (min + ;nominal max deviation is the sum of: each class's probability multiplied by probability of getting it wrong + (apply "+" + (map + (lambda (let + (assoc class_prob (* (/ (current_value 1) total_count)) ) + (* class_prob (- 1 class_prob)) + )) + (values class_counts_map) + ) + ) + + (max + (get !cachedFeatureMinResidualMap (current_index)) + (current_value) + (get !userSpecifiedFeatureErrorsMap (current_index)) + ) + ) + ) ) ) + + ;else continuous value don't have upper bounds, set the lower bound + (max + (get !cachedFeatureMinResidualMap (current_index)) + (current_value) + (get !userSpecifiedFeatureErrorsMap (current_index)) + ) ) ) - ;else continuous value don't have upper bounds, set the lower bound - (max - (get !cachedFeatureMinResidualMap (current_index)) - (current_value) - (get !userSpecifiedFeatureErrorsMap (current_index)) + ;if using shared deviations, map over 
a reduced residuals map with only features that are not + ; in a shared deviations group or are the primary keys of a shared deviations group + (if (size !sharedDeviationsMap) + (remove + (get feature_residuals_map "residual_map") + !sharedDeviationsNonPrimaryFeatures + ) + + (get feature_residuals_map "residual_map") ) ) ) - (get feature_residuals_map "residual_map") + + (if (size !sharedDeviationsMap) + (call !ExpandForSharedDeviations (assoc compressed_values temp_residual_map)) + temp_residual_map + ) ) ) @@ -261,6 +352,7 @@ ;all the context features are specified robust_residuals (false) + use_shared_deviations (true) )) ) @@ -749,8 +841,44 @@ )) features ) + )) + + (declare (assoc + nominal_class_statistical_significant_count 30 + max_samples_threshold 15000 + )) + + (declare (assoc ;need to have at least 2.5% (that's a value of 50 using default sample of 2000) non-null values for a feature min_value_count (ceil (* 0.025 (size case_ids))) + min_count_nominals_map + (map + (lambda (let + (assoc feature (current_index 1) ) + (declare (assoc + class_counts_above_stat_significance + (filter + (lambda (>= (current_value) nominal_class_statistical_significant_count)) + (values (get !expectedValuesMap [weight_feature feature "class_counts"])) + ) + )) + + (declare (assoc + smallest_count_above_statistical_significance (first (sort class_counts_above_stat_significance)) + )) + + ;min(15k or the max threshold, (3 / (smallest_count_above_statistical_significance / num_cases))) + (if smallest_count_above_statistical_significance + (min max_samples_threshold (/ 3 (/ smallest_count_above_statistical_significance num_training_cases))) + 0 + ) + )) + !nominalsMap + ) + )) + + (declare (assoc + feature_index_map (zip features (indices features)) )) ;determine if any of the lists in feature_residuals_lists are too short (< 50 values), if so keep feature as needing to be resampled @@ -768,11 +896,32 @@ ) ) + (declare (assoc + ;assoc of nominal feature -> number of needed extra 
samples + num_needed_nominal_values_per_feature_map + (filter + (lambda (> (current_value) 0)) + (map + (lambda + (- + (get min_count_nominals_map (current_index)) + ;if this number will be padded due to nulls, discount that min amount, otherwise discount existing residual samples + (if (contains_index num_valid_values_per_feature_map (current_index)) + min_value_count + (size (get feature_residuals_lists (get feature_index_map (current_index))) ) + ) + ) + ) + !nominalsMap + ) + ) + )) + ;some features did not have 50 values, accrue case_ids, then call !the main method agan - (if (size num_valid_values_per_feature_map) + (if (or (size num_valid_values_per_feature_map) (size num_needed_nominal_values_per_feature_map)) (seq ;create the resampled case_ids list that contains cases with enough necessary non-null feature values - (assign (assoc + (assign (assoc case_ids ;append all the lists of cases ids for each feature into one list (apply "append" @@ -791,6 +940,27 @@ ) )) + (if (size num_needed_nominal_values_per_feature_map) + (let + (assoc + max_nominal_samples_needed + (* 2 (apply "max" (values num_needed_nominal_values_per_feature_map)) ) + ) + (if (= (null) case_ids) (assign (assoc case_ids [] )) ) + + (accum (assoc + case_ids + (contained_entities (list + (query_exists !internalLabelSession) + (if valid_weight_feature + (query_weighted_sample weight_feature max_nominal_samples_needed (rand)) + (query_sample max_nominal_samples_needed (rand)) + ) + )) + )) + ) + ) + ;re-compute residuals on these case_ids, store into case_residuals_lists (call !AccumulateFeatureResiduals) @@ -1290,6 +1460,17 @@ ) )) + (if use_shared_deviations + (seq + (assign (assoc + feature_residuals_lists (call !PrepSharedDeviations (assoc features features_with_nulls)) + )) + (assign (assoc + features_with_nulls (filter (lambda (not (contains_value !sharedDeviationsNonPrimaryFeatures (current_value)))) features_with_nulls) + )) + ) + ) + ;create a map of feature -> null prediction 
probability by averaging out all the correctly predicted nulls for each feature (assign (assoc null_accuracies_map @@ -1316,6 +1497,10 @@ (lambda (let (assoc feature (current_index 1) + feature_grouping + (if (size !sharedDeviationsMap) + (call !GetSharedDeviationGrouping (assoc feature_group_to_retrieve (current_index 2))) + ) ;null deviation is: 1 - null prediction probability null_prediction_deviation (- 1 (current_value 1)) @@ -1337,12 +1522,44 @@ ;compute from the existing feature max-min value or bounds if provided (max (- - (get !featureMarginalStatsMap (list weight_feature feature "max")) - (get !featureMarginalStatsMap (list weight_feature feature "min")) + (if (size feature_grouping) + (apply "max" + (filter (map + (lambda (get !featureMarginalStatsMap (list weight_feature (current_value 1) "max"))) + feature_grouping + )) + ) + (get !featureMarginalStatsMap (list weight_feature feature "max")) + ) + (if (size feature_grouping) + (apply "min" + (filter (map + (lambda (get !featureMarginalStatsMap (list weight_feature (current_value 1) "min"))) + feature_grouping + )) + ) + (get !featureMarginalStatsMap (list weight_feature feature "min")) + ) ) (- - (get !featureBoundsMap (list feature "max")) - (get !featureBoundsMap (list feature "min")) + (if (size feature_grouping) + (apply "max" + (filter (map + (lambda (get !featureBoundsMap (list (current_value 1) "max"))) + feature_grouping + )) + ) + (get !featureBoundsMap (list feature "max")) + ) + (if (size feature_grouping) + (apply "min" + (filter (map + (lambda (get !featureBoundsMap (list (current_value 1) "min"))) + feature_grouping + )) + ) + (get !featureBoundsMap (list feature "min")) + ) ) ) ) @@ -1357,4 +1574,4 @@ ) )) ) -) +) \ No newline at end of file diff --git a/howso/shared_deviations.amlg b/howso/shared_deviations.amlg new file mode 100644 index 000000000..8074dfa7d --- /dev/null +++ b/howso/shared_deviations.amlg @@ -0,0 +1,92 @@ +;Contains methods for shared deviations.
+(null + + ;Helper function that takes in a feature that belongs to a group of shared deviations features and returns all of the features in that group. + ; + ;parameters: + ; feature_group_to_retrieve: A feature that belongs to a group of shared deviations features. + #!GetSharedDeviationGrouping + (declare + (assoc feature_group_to_retrieve (null)) + (indices + (let + (assoc current_shared_deviations_key (get !sharedDeviationsMap feature_group_to_retrieve)) + (filter + (lambda (= (current_value) current_shared_deviations_key) ) + !sharedDeviationsMap + ) + ) + ) + ) + + ;Helper function that takes a mapping or list that has been reduced, and expands it by re-adding the features from the groups that have been dropped. + ; + ;parameters: + ; compressed_values: an assoc or list of values that need to be expanded. If a list, adds back in the features from shared deviations groups that were removed + ; for calculations. If an assoc, adds back in removed features as keys and copies the value of the primary key for each shared deviation group as its value. + ; e.g. {"a": 2.5, "b": 0} -> {"a": 2.5, "b": 0, "c": 2.5, "d": 2.5} if features a, c, d were in a shared deviations group.
+ #!ExpandForSharedDeviations + (declare + (assoc compressed_values (null)) + (if (size !sharedDeviationsMap) + (if (~ (list) compressed_values) + (assign (assoc + compressed_values (values (append compressed_values (indices !sharedDeviationsMap)) (true)) + )) + + ;else compressed_values is an assoc of feature -> value + (let + (assoc + ;features that were dropped and had their values combined into their primary shared deviations feature + removed_features (indices (remove !sharedDeviationsMap (indices compressed_values))) + ) + (accum (assoc + compressed_values + ;grab the values for the removed features by looking up their primary key values + (map + (lambda + (get compressed_values (get !sharedDeviationsMap (current_index))) + ) + (zip removed_features) + ) + )) + ) + ) + ) + compressed_values + ) + + ;Helper method to reduce feature_residuals_lists into just the residuals lists corresponding to non-shared deviation features and + ;primary shared deviation features by combining all the non-primary residuals into their corresponding primary feature residuals + ; + ;parameters: + ; features: A features list + ; feature_residuals_lists: A list of residuals lists, one per feature, in the same order as the features list + #!PrepSharedDeviations + (let + (assoc feature_index_map (zip features (indices features)) ) + + (filter (map + (lambda + (if (contains_value !sharedDeviationsPrimaryFeatures (current_value)) + ;combine all the residuals for a feature group + (apply "append" + ;grab the residuals for the features by their indices + (unzip + feature_residuals_lists + (unzip feature_index_map (get !sharedDeviationGroupByPrimaryMap (current_value))) + ) + ) + + ;all non-primary feature residual lists are replaced with null so they can be filtered out + (contains_value !sharedDeviationsNonPrimaryFeatures (current_value)) + (null) + + ;leave residuals for non-shared deviation features as-is + (get feature_residuals_lists (get feature_index_map (current_value))) + ) + ) + features +
)) + ) +) \ No newline at end of file diff --git a/unit_tests/ut_h_dependent_features.amlg b/unit_tests/ut_h_dependent_features.amlg index 7938cd38a..5e992836c 100644 --- a/unit_tests/ut_h_dependent_features.amlg +++ b/unit_tests/ut_h_dependent_features.amlg @@ -79,7 +79,7 @@ context_features features targeted_model "targetless" k_values (list 8) - use_deviations(false) + use_deviations (false) )) (print "Verify dependendents boundary map is properly set: \n") diff --git a/unit_tests/ut_h_shared_deviations.amlg b/unit_tests/ut_h_shared_deviations.amlg new file mode 100644 index 000000000..4445be0f3 --- /dev/null +++ b/unit_tests/ut_h_shared_deviations.amlg @@ -0,0 +1,125 @@ +(seq + #unit_test (direct_assign_to_entities (assoc unit_test (load "unit_test.amlg"))) + (call (load "unit_test_howso.amlg") (assoc name "ut_h_shared_deviations.amlg" retries 1)) + + (declare + (assoc + result (null) + features (list "height" "width" "length" "tart" "sweet" "size" "weight" "fruit" "color1" "color2") + action_features (list "fruit") + context_features (list "height" "width" "length" "tart" "sweet" "size" "weight" "color1" "color2") + fruit_data + (list + ; "h" "w" "l" "tart" "sweet" "size" "weight" "fruit" "color1" "color2" + (list 1 0 15 11 1 (null) 1 "strawberry" "red-1" "red-1") + (list 2 2 20 21 .45 "small" .8 "strawberry" "red-2" "red-2") + (list 3 4 35 31 .42 "small" 1.2 "strawberry" "red-3" "red-3") + (list 4 6 40 41 .49 "small" 1.1 "strawberry" "red-4" "red-4") + + (list 5 8 55 51 .4 "small" 2 (null) "green-1" "green-1") + (list 6 10 60 61 .55 "medium" 3 "apple" "green-1" "green-2") + (list 7 12 75 71 .52 "medium" 3.5 "apple" "green-1" "green-3") + (list 8 14 80 81 .54 "medium" 4.5 (null) "green-1" "green-4") + + (list 9 16 95 91 .60 "small" (null) "banana" "yellow-1" "yellow-1") + (list 10 18 100 101 .65 "medium" (null) "banana" "yellow-2" "yellow-2") + (list 11 20 115 111 .69 (null) 5.5 "banana" "yellow-3" "yellow-3") + (list 12 22 120 121 .62 "medium" 7 "banana"
"yellow-4" "yellow-4") + ) + ) + ) + + (assign (assoc + result + (call_entity "howso" "set_feature_attributes" (assoc + feature_attributes + (assoc + "color1" (assoc "type" "nominal") + "color2" (assoc "type" "nominal" "unique" (true)) + "fruit" (assoc "type" "nominal" "shared_deviations" (list "color2")) + ) + )) + )) + + (call keep_result_errors) + + (print "Invalid shared deviations feature groupings with unique nominals: ") + (call assert_same (assoc + obs result + exp "Features with shared deviations may not be unique nominals, as they have null deviations. The following features are in a shared deviations group and is a unique nominal: color2." + )) + + (assign (assoc + result + (call_entity "howso" "set_feature_attributes" (assoc + feature_attributes + (assoc + "color1" (assoc "type" "nominal") + "color2" (assoc "type" "nominal") + "sweet" (assoc "type" "nominal" "shared_deviations" (list "color2")) + "tart" (assoc "type" "nominal" "shared_deviations" (list "color1")) + "height" (assoc "type" "nominal" "shared_deviations" (list "weight")) + ) + )) + )) + + (call keep_result_errors) + + (print "Invalid shared deviations feature groupings with nominal and non-nominal featuers: ") + (call assert_same (assoc + obs result + exp "Nominal features may not share deviations with non-nominal features." 
+ )) + + (call_entity "howso" "set_feature_attributes" (assoc + feature_attributes + (assoc + "fruit" (assoc "type" "nominal") + "color1" (assoc "type" "nominal") + "color2" (assoc "type" "nominal" "unique" (true)) + "weight" (assoc "type" "continuous" "shared_deviations" (list "height" "tart")) + "length" (assoc "type" "continuous" "shared_deviations" (list "sweet")) + "size" (assoc "type" "nominal" "shared_deviations" (list "fruit" "size")) + ) + )) + + (call_entity "howso" "train" (assoc + features features + cases fruit_data + session "unit_test" + )) + + (call_entity "howso" "analyze" (assoc + context_features features + action_features (list (last features)) + use_deviations (true) + )) + + (declare (assoc + shared_feature_deviations + (get (call_entity "howso" "get_params") (list 1 "payload" "hyperparameter_map" "color2" "color1.fruit.height.length.size.sweet.tart.weight.width." "full" ".none" "featureDeviations")) + )) + + (print "Features with shared deviations have same deviations: weight and height ") + (call assert_same (assoc + obs (get shared_feature_deviations "weight") + exp (get shared_feature_deviations "height") + )) + (print "Features with shared deviations have same deviations: weight and tart ") + (call assert_same (assoc + obs (get shared_feature_deviations "weight") + exp (get shared_feature_deviations "tart") + )) + (print "Features with shared deviations have same deviations: length and sweet ") + (call assert_same (assoc + obs (get shared_feature_deviations "length") + exp (get shared_feature_deviations "sweet") + )) + (print "Features with shared deviations have same deviations: size and fruit ") + (call assert_same (assoc + obs (get shared_feature_deviations "size") + exp (get shared_feature_deviations "fruit") + )) + + (call exit_if_failures (assoc msg unit_test_name )) +) diff --git a/unit_tests/ut_h_shared_deviations_series.amlg b/unit_tests/ut_h_shared_deviations_series.amlg new file mode 100644 index 000000000..ba1b6b48b --- 
/dev/null +++ b/unit_tests/ut_h_shared_deviations_series.amlg @@ -0,0 +1,175 @@ + +(seq + #unit_test (direct_assign_to_entities (assoc unit_test (load "unit_test.amlg"))) + (call (load "unit_test_howso.amlg") (assoc name "ut_h_shared_deviations_series.amlg" debug (false) retries 1)) + + (declare (assoc + data + (list + (list "stk_A" 8 100) + (list "stk_A" 9 101) + (list "stk_A" 10 103) + (list "stk_A" 11 105) + (list "stk_A" 12 104) + (list "stk_A" 13 106) + (list "stk_A" 14 104) + (list "stk_A" 15 103) + (list "stk_A" 16 105) + (list "stk_A" 17 106) + (list "stk_A" 18 107) + (list "stkB" 8 110) + (list "stkB" 9 109) + (list "stkB" 10 110) + (list "stkB" 11 109) + (list "stkB" 12 108) + (list "stkB" 13 109) + (list "stkB" 14 108) + (list "stkB" 15 106) + (list "stoC" 5 94) + (list "stoC" 6 96) + (list "stoC" 7 95) + (list "stoC" 8 93) + (list "stoC" 9 95) + (list "stoC" 10 98) + (list "stoC" 11 99) + (list "stoC" 12 98) + (list "stoC" 13 95) + (list "stoC" 14 96) + (list "stoC" 15 94) + (list "stoC" 16 93) + (list "stoC" 17 91) + (list "stoC" 18 92) + (list "stoC" 19 90) + (list "stoC" 20 91) + (list "stoC" 21 89) + (list "stoC" 22 91) + ) + + features (list "stock" "time" "value") + result (null) + result_features (null) + )) + + (call_entity "howso" "set_feature_attributes" (assoc + feature_attributes + (assoc + "stock" + (assoc + "type" "nominal" + "id_feature" (true) + "time_series" (assoc "type" "delta") + ) + "time" + (assoc + "type" "continuous" + "decimal_places" 0 + "time_series" (assoc "time_feature" (true) "type" "delta" "universal" (false)) + "bounds" + (assoc + "allow_null" (false) + "min" 1 + "max" 30 + ) + ) + "value" + (assoc + "type" "continuous" + "decimal_places" 0 + "time_series" (assoc "type" "delta" "num_lags" 2) + "bounds" + (assoc + "allow_null" (true) + "min" 80 + "max" 125 + ) + ) + ) + )) + + (call_entity "howso" "train" (assoc + cases data + features features + )) + + (call_entity "howso" "analyze" (assoc use_deviations (true))) + + 
(declare (assoc + shared_feature_deviations + (get (call_entity "howso" "get_params") (list 1 "payload" "hyperparameter_map" ".targetless" ".series_progress..series_progress_delta..time_delta_1..time_lag_1..value_delta_1..value_lag_1..value_lag_2.stock.time.value." "robust" ".none" "featureDeviations")) + )) + + ;shared_feature_deviations is looked up by feature name below; lag features are expected to have the same deviation as their source feature + + (print "Series lag features with shared deviations are same: time and .time_lag_1.") + (call assert_same (assoc + obs (get shared_feature_deviations ".time_lag_1") + exp (get shared_feature_deviations "time") + )) + + (print "Series lag features with shared deviations are same: value and .value_lag_1.") + (call assert_same (assoc + obs (get shared_feature_deviations "value") + exp (get shared_feature_deviations ".value_lag_1") + )) + + (print "Series lag features with shared deviations are same: .value_lag_1 and .value_lag_2.") + (call assert_same (assoc + obs (get shared_feature_deviations ".value_lag_1") + exp (get shared_feature_deviations ".value_lag_2") + )) + + (call_entity "howso" "set_feature_attributes" (assoc + feature_attributes + (assoc + "stock" + (assoc + "type" "nominal" + "id_feature" (true) + "time_series" (assoc "type" "delta") + ) + "time" + (assoc + "type" "continuous" + "decimal_places" 0 + "time_series" (assoc "time_feature" (true) "type" "delta" "universal" (false)) + "bounds" + (assoc + "allow_null" (false) + "min" 1 + "max" 30 + ) + ) + "value" + (assoc + "type" "continuous" + "shared_deviations" (false) + "decimal_places" 0 + "time_series" (assoc "type" "delta" "num_lags" 2) + "bounds" + (assoc + "allow_null" (true) + "min" 80 + "max" 125 + ) + ) + ) + )) + + (call_entity "howso" "analyze" (assoc use_deviations (true))) + + (assign (assoc shared_feature_deviations + (get (call_entity "howso" "get_params") (list 1 "payload" "hyperparameter_map" ".targetless"
".series_progress..series_progress_delta..time_delta_1..time_lag_1..value_delta_1..value_lag_1..value_lag_2.stock.time.value." "robust" ".none" "featureDeviations")) + )) + + (print "Series lag features with no shared deviations are different: .value_lag_1 and .value_lag_2.") + (call assert_false (assoc + obs (= (get shared_feature_deviations ".value_lag_2") (get shared_feature_deviations ".value_lag_1")) + )) + + (print "Series lag features with no shared deviations are different: value and .value_lag_1.") + (call assert_false (assoc + obs (= (get shared_feature_deviations "value") (get shared_feature_deviations ".value_lag_1")) + )) + + (call exit_if_failures (assoc msg unit_test_name)) +) \ No newline at end of file diff --git a/unit_tests/ut_howso.amlg b/unit_tests/ut_howso.amlg index 9b2f66ebf..d93cfe418 100644 --- a/unit_tests/ut_howso.amlg +++ b/unit_tests/ut_howso.amlg @@ -45,6 +45,8 @@ "ut_h_unique_ids.amlg" "ut_h_derive_start_end.amlg" "ut_h_derive_custom.amlg" + "ut_h_shared_deviations.amlg" + "ut_h_shared_deviations_series.amlg" "ut_h_time_series_datetime.amlg" "ut_h_time_series.amlg" "ut_h_time_series_stock.amlg"