diff --git a/howso.amlg b/howso.amlg index 999bcfe99..4567a5c23 100644 --- a/howso.amlg +++ b/howso.amlg @@ -235,6 +235,7 @@ "synthesis_utilities" "synthesis_validation" "train" + "train_ts_ablation" "typing" "update_cases" "upgrade" diff --git a/howso/ablation.amlg b/howso/ablation.amlg index 7b6d5893a..7f1466537 100644 --- a/howso/ablation.amlg +++ b/howso/ablation.amlg @@ -34,6 +34,8 @@ use_case_weights use_case_weights weight_feature weight_feature details (assoc influential_cases (true) influential_cases_raw_weights (true)) + skip_encoding (true) + skip_decoding (true) ) )) @@ -51,7 +53,7 @@ ) (declare (assoc - influential_cases (get (call !SingleReact react_kwargs) "influential_cases") + influential_cases (get (call !ReactDiscriminative react_kwargs) "influential_cases") )) (if @@ -624,18 +626,18 @@ react_kwargs (assoc use_case_weights use_case_weights - weight_feature weight_feature - details (assoc influential_cases (true) influential_cases_raw_weights (true)) + weight_feature weight_feature + details (assoc influential_cases (true) influential_cases_raw_weights (true)) context_features features - context_values feature_values case_indices (unzip (last influentials_entropy_pair) [".session" ".session_training_index"] ) preserve_feature_values features leave_case_out (true) + skip_encoding (true) ) ) (declare (assoc - neighbor_influential_cases (get (call !SingleReact react_kwargs) "influential_cases") + neighbor_influential_cases (get (call !ReactDiscriminative react_kwargs) "influential_cases") )) (declare (assoc @@ -663,7 +665,7 @@ (and (or (not (and !hasInfluenceWeightEntropies !autoAblationEnabled) ) - (< num_cases !autoAblationMinNumCases ) + (< (+ num_cases (size cases)) !autoAblationMinNumCases ) ) (= (null) !autoAblationExactPredictionFeatures diff --git a/howso/custom_codes.amlg b/howso/custom_codes.amlg index 671f50371..84e2d2da2 100644 --- a/howso/custom_codes.amlg +++ b/howso/custom_codes.amlg @@ -83,7 +83,10 @@ data series_data ;specify indices 
for series_ordered_by_features and the index of !internalLabelSessionTrainingIndex column_order_indices - (append (range 0 (- (size series_ordered_by_features) 1)) (size necessary_features)) + (append + (unzip (zip necessary_features (indices necessary_features)) series_ordered_by_features) + (size necessary_features) + ) )) )) ) diff --git a/howso/derive_features.amlg b/howso/derive_features.amlg index 67aff8163..632d52cb1 100644 --- a/howso/derive_features.amlg +++ b/howso/derive_features.amlg @@ -288,6 +288,10 @@ lag_features (list) ) + (if (= 0 (size lag_features)) + (conclude) + ) + (declare (assoc series_id_features (get !featureAttributes (list (first lag_features) "auto_derive_on_train" "series_id_features")) series_ordered_by_features (get !featureAttributes (list (first lag_features) "auto_derive_on_train" "ordered_by_features")) @@ -337,7 +341,7 @@ series_data (call !MultiSortList (assoc data series_data - column_order_indices (range 0 (size series_ordered_by_features)) + column_order_indices (unzip (zip necessary_features (indices necessary_features)) series_ordered_by_features) )) )) ) diff --git a/howso/derive_utilities.amlg b/howso/derive_utilities.amlg index e5e098b04..6e5228230 100644 --- a/howso/derive_utilities.amlg +++ b/howso/derive_utilities.amlg @@ -171,12 +171,15 @@ a (current_value 1) b (current_value 2) ) + (apply "or" (map (lambda - (- - (get a (current_value)) - (get b (current_value)) + ;use - for numeric value comparison, and > for strings so it compares them alphabetically + (if (~ 0 (get a (current_value))) + (- (get a (current_value)) (get b (current_value)) ) + + (> (get a (current_value)) (get b (current_value)) ) ) ) column_order_indices diff --git a/howso/train.amlg b/howso/train.amlg index ba1a5579c..9d19b9193 100644 --- a/howso/train.amlg +++ b/howso/train.amlg @@ -404,36 +404,39 @@ ) ) - ;if derived features wasn't specified, auto-detect them - (if (and (= (null) derived_features) (> (size !derivedFeaturesMap) 0)) - (seq - 
(assign (assoc derived_features (list))) - - ;check features vs !sourceToDerivedFeatureMap and populate derived_features accordingly - (map - (lambda (let - (assoc feature_name (current_value 1)) - ;if this trained feature has derived features, add all of them to the derived_features list - (if (contains_index !sourceToDerivedFeatureMap feature_name) - (accum (assoc derived_features (get !sourceToDerivedFeatureMap feature_name))) - ) - )) - features - ) - - ;clear out possible duplicates out of derived_features - (assign (assoc derived_features (values derived_features (true)))) - ) - ) - - (if (and skip_ablation (> (size new_case_ids ) 0) ) ;update !dataMassChangeSinceLastAnalyze to the already computed new_possible_data_mass (assign_to_entities (assoc !dataMassChangeSinceLastAnalyze new_possible_data_mass )) ) + ;if derived features wasn't specified, auto-detect them + (if (and (= (null) derived_features) (> (size !derivedFeaturesMap) 0)) + (assign (assoc + derived_features + (values + (apply "append" + (map + (lambda + ;if this trained feature has derived features, add all of them to the derived_features list + (if (contains_index !sourceToDerivedFeatureMap (current_value)) + (get !sourceToDerivedFeatureMap (current_value)) + [] + ) + ) + features + ) + ) + (true) + ) + )) + ) + ;auto populate derived features if necessary - (if (> (size derived_features) 0) + (if (and + (> (size derived_features) 0) + ;either non-time series or ablation was skipped and thus features were not derived yet + (or skip_ablation (= (null) !tsTimeFeature) ) + ) (call !DeriveTrainFeatures (assoc features features ;keep and derive only those features that are not in the features list @@ -677,137 +680,145 @@ cases ) - ;else ablating cases during training - (let - (assoc - train_features - ;if auto ablate is enabled, populate the weight feature for this case - (if !autoAblationEnabled - (append features (list !autoAblationWeightFeature)) - features - ) - ) + ;time feature exists, do 
ablation on time series by deriving one series at a time and then ablating those cases prior to training + !tsTimeFeature + (call !TrainTimeSeriesAblation) - ;if user explicitly specified to skip autoanalyze, send back "analyze" status so user knows an analyze is needed - (if skip_auto_analyze - (assign (assoc status_output "analyze")) - ) + ;else ablating cases during training + (call !TrainCasesWithAblation) + ) - (declare (assoc - batch_data_mass_threshold 0 - batch_data_mass 0 - batch_size 0 - input_case_index 0 - output_case_ids [] - - ;threshold-related variables - thresholds_enabled - (or - (size !autoAblationAbsThresholdMap) - (size !autoAblationDeltaThresholdMap) - (size !autoAblationRelThresholdMap) - ) - prev_prediction_stats_map {} - new_prediction_stats_map {} - thresholds_satisfied (false) - )) + #!TrainCasesWithAblation + (let + (assoc + train_features + ;if auto ablate is enabled, populate the weight feature for this case + (if !autoAblationEnabled + (append features (list !autoAblationWeightFeature)) + features + ) + ) - ;split by batches of cases until next analyze - (while (< input_case_index (size cases)) + ;if user explicitly specified to skip autoanalyze, send back "analyze" status so user knows an analyze is needed + (if skip_auto_analyze + (assign (assoc status_output "analyze")) + ) - (if (and thresholds_enabled (not skip_ablation)) - (seq - (assign (assoc - prev_prediction_stats_map new_prediction_stats_map - )) - (assign (assoc - new_prediction_stats_map - (get - (call !CalculateFeatureResiduals (assoc - features features - weight_feature !autoAblationWeightFeature - use_case_weights (true) - compute_all_statistics (true) - store_values (false) - )) - "prediction_stats" - ) - )) - (assign (assoc - thresholds_satisfied - (apply "or" - (values - (call !CheckThresholds (assoc - abs_threshold_map !autoAblationAbsThresholdMap - delta_threshold_map !autoAblationDeltaThresholdMap - rel_threshold_map !autoAblationRelThresholdMap - 
prev_prediction_stats_map prev_prediction_stats_map - new_prediction_stats_map new_prediction_stats_map - )) - ) - ) - )) - ) + (declare (assoc + batch_data_mass_threshold 0 + batch_data_mass 0 + batch_size 0 + input_case_index 0 + output_case_ids [] + + ;threshold-related variables + thresholds_enabled + (or + (size !autoAblationAbsThresholdMap) + (size !autoAblationDeltaThresholdMap) + (size !autoAblationRelThresholdMap) ) + prev_prediction_stats_map {} + new_prediction_stats_map {} + thresholds_satisfied (false) + )) - (assign (assoc - ;always train a few extra cases since some are expected to be ablated - ;to prevent this threshold value from dropping down to very small values - ;but also limit to how much is trained at a time - batch_data_mass_threshold - (max - 10 - (min - (+ 10 (- !autoAnalyzeThreshold !dataMassChangeSinceLastAnalyze)) - !ablationBatchSize - ) - ) - batch_data_mass 0 - batch_size 0 - )) + ;split by batches of cases until next analyze + (while (< input_case_index (size cases)) - (while (and (< batch_data_mass batch_data_mass_threshold) (< (+ input_case_index (current_index)) (size cases)) ) + (if (and thresholds_enabled (not skip_ablation)) + (seq + (assign (assoc + prev_prediction_stats_map new_prediction_stats_map + )) (assign (assoc - batch_data_mass - (+ - (or (previous_result 1) 0) - (if accumulate_weight_feature - (or (get cases [(+ input_case_index (current_index 2)) weight_feature_index ]) 1) - 1 + new_prediction_stats_map + (get + (call !CalculateFeatureResiduals (assoc + features features + weight_feature !autoAblationWeightFeature + use_case_weights (true) + compute_all_statistics (true) + store_values (false) + )) + "prediction_stats" + ) + )) + (assign (assoc + thresholds_satisfied + (apply "or" + (values + (call !CheckThresholds (assoc + abs_threshold_map !autoAblationAbsThresholdMap + delta_threshold_map !autoAblationDeltaThresholdMap + rel_threshold_map !autoAblationRelThresholdMap + prev_prediction_stats_map 
prev_prediction_stats_map + new_prediction_stats_map new_prediction_stats_map + )) ) ) - batch_size (+ 1 (current_index 1)) )) - batch_data_mass ) + ) + + (assign (assoc + ;always train a few extra cases since some are expected to be ablated + ;to prevent this threshold value from dropping down to very small values + ;but also limit to how much is trained at a time + batch_data_mass_threshold + (max + 10 + (min + (+ 10 (- !autoAnalyzeThreshold !dataMassChangeSinceLastAnalyze)) + !ablationBatchSize + ) + ) + batch_data_mass 0 + batch_size 0 + )) + + (while (and (< batch_data_mass batch_data_mass_threshold) (< (+ input_case_index (current_index)) (size cases)) ) (assign (assoc - output_case_ids - (call !TrainCasesWithAblation (assoc - cases (unzip cases (range input_case_index (+ input_case_index batch_size -1)) ) - ;ensure that starting training index value is updated for each batch - session_training_index (+ trained_instance_count input_case_index) - )) + batch_data_mass + (+ + (or (previous_result 1) 0) + (if accumulate_weight_feature + (or (get cases [(+ input_case_index (current_index 2)) weight_feature_index ]) 1) + 1 + ) + ) + batch_size (+ 1 (current_index 1)) )) + batch_data_mass + ) + (assign (assoc + output_case_ids + (call !AblateCases (assoc + cases (unzip cases (range input_case_index (+ input_case_index batch_size -1)) ) + ;ensure that starting training index value is updated for each batch + session_training_index (+ trained_instance_count input_case_index) + )) + )) - (if (and run_autoanalyze_check (not skip_auto_analyze)) - (call !AutoAnalyzeIfNeeded) - ) - - (accum (assoc input_case_index batch_size )) + (if (and run_autoanalyze_check (not skip_auto_analyze)) + (call !AutoAnalyzeIfNeeded) + ) - (if (> (current_index) 0) - (assign (assoc output_case_ids (append (previous_result 1) output_case_ids) )) - ) + (accum (assoc input_case_index batch_size )) - output_case_ids + (if (> (current_index) 0) + (assign (assoc output_case_ids (append 
(previous_result 1) output_case_ids) )) ) output_case_ids ) + + output_case_ids ) + ;Helper method to train cases with an ablation check - #!TrainCasesWithAblation + #!AblateCases (let (assoc indices_to_train @@ -831,6 +842,22 @@ )) )) ) + + ;time series ablation explicitly keeps the first and last case of a series + (if ts_ablated_indices_map + (if (or + (= (- (size cases) 1) (current_index)) + ;first case and it's actually the first case of the series + (and + (= 0 (current_index)) + ;the .series_index feature is second to last + (= 0 (get feature_values (- (size features) 2)) ) + ) + ) + (conclude (true)) + ) + ) + ;do not train on this case if it is null or all case values are null or it's within provided thresholds ;if one of the ablation methods returns false, then the case should be ablated. (and @@ -853,7 +880,12 @@ (accum (assoc ablated_indices_list (map - (lambda (+ session_training_index (current_value))) + (lambda + (if (size ts_ablated_indices_map) + (+ session_training_index (get ts_ablated_indices_map (current_value))) + (+ session_training_index (current_value)) + ) + ) (remove (indices cases) indices_to_train) ) ablation_trained_instance_count (size indices_to_train) @@ -892,10 +924,24 @@ ) )) + ;if there are ablated cases to accumulate influence weights to, do it here + ;also increase !dataMassChangeSinceLastAnalyze by the ablated cases and recompute influence weights entropy + (if (and accumulate_weight_feature (< (size indices_to_train) (size cases)) ) + (call !AccumulateCaseInfluenceWeights (assoc + features features + accumulate_weight_feature accumulate_weight_feature + cases + (unzip + cases + (remove (indices cases) indices_to_train) + ) + )) + ) + ;if auto analyzing, need to accumulate data masses for all cases (if run_autoanalyze_check (let - (assoc + (assoc mass_to_accumulate (if (and accumulate_weight_feature (size indices_to_train)) (apply "+" @@ -912,28 +958,12 @@ (accum_to_entities (assoc !dataMassChangeSinceLastAnalyze 
mass_to_accumulate)) - ;if there are ablated cases to accumulate influence weights to, do it here - ;also increase !dataMassChangeSinceLastAnalyze by the ablated cases and recompute influence weights entropy - (if (and accumulate_weight_feature (< (size indices_to_train) (size cases)) ) - (seq - (call !AccumulateCaseInfluenceWeights (assoc - features features - accumulate_weight_feature accumulate_weight_feature - cases - (unzip - cases - (remove (indices cases) indices_to_train) - ) - )) - - ;recompute influence weights entropy - (call !ComputeAndStoreInfluenceWeightEntropies (assoc - features features - weight_feature accumulate_weight_feature - use_case_weights (true) - )) - ) - ) + ;recompute influence weights entropy + (call !ComputeAndStoreInfluenceWeightEntropies (assoc + features features + weight_feature accumulate_weight_feature + use_case_weights (true) + )) ) ) diff --git a/howso/train_ts_ablation.amlg b/howso/train_ts_ablation.amlg new file mode 100644 index 000000000..7d348891d --- /dev/null +++ b/howso/train_ts_ablation.amlg @@ -0,0 +1,602 @@ +(null + ;Since all cases are expected for a series to be trained together as a group, this checks if there are any + ;cases from one series that are between cases of another series. If it finds a case that is not grouped + ;together with others of its series, this method will return true. 
+ #!DoesDataNeedGroupingById + (declare + (assoc + data [] + id_indices [0] + ) + + (declare (assoc + one_id (= 1 (size id_indices)) + id_index (first id_indices) + needs_grouping (false) + )) + + ;This flow creates a set of all unique ids, then iterates over all cases, looking at each case's id + ;if a different id is encountered it is removed from that unique set + ;if an id that has already been removed from the set is encountered again, + ;that means the case is out of order and the data needs to be grouped + (if one_id + (let + (assoc + unique_ids_set + (zip (map + (lambda (get (current_value) id_index) ) + data + )) + previous_id (null) + ) + + (while (< (current_index) (size data)) + ;first row sets previous_id and seeds (previous_result) with unique_ids_set minus that first id + (if (= 0 (current_index)) + (seq + (assign (assoc + previous_id (get data [(current_index 2) id_index]) + )) + (remove unique_ids_set previous_id) + ) + + ;if the current id doesn't match previous_id, check to see if it has + ;already been encountered.
if so, the data is out of order + (!= previous_id (get data [(current_index 1) id_index])) + (seq + (assign (assoc + previous_id (get data [(current_index 2) id_index]) + )) + + ;new ids should still be in unique_ids_set, but if they are not they've been encountered before + (if (not (contains_index (previous_result 0 (true)) previous_id)) + (conclude (conclude + (assign (assoc needs_grouping (true) )) + )) + ) + + ;id has not been encountered before, remove it from unique_ids_set + (remove (previous_result) previous_id) + ) + + ;else no change to unique_ids_set + (previous_result) + ) + ) + ) + + ;else multiple ids, less efficient + (let + (assoc + unique_ids_list + (values + (map (lambda (unzip (current_value) id_indices)) data) + (true) + ) + previous_ids [] + ) + + (while (< (current_index) (size data)) + ;first row sets previous_id and unique_ids_list into (previous_result) + (if (= 0 (current_index)) + (seq + (assign (assoc + previous_ids (unzip (get data (current_index 1)) id_indices) + )) + (filter + (lambda (!= (current_value) previous_ids)) + unique_ids_list + ) + ) + + ;if the current ids don't match previous_ids, check to see if they have + ;already been encountered. 
if so, the data is out of order + (!= previous_ids (unzip (get data (current_index)) id_indices) ) + (seq + (assign (assoc + previous_ids (unzip (get data (current_index 1)) id_indices) + )) + + ;new ids should still be in unique_ids_list, but if they are not they've been encountered before + (if (not (contains_value (previous_result 0 (true)) previous_ids)) + (conclude (conclude + (assign (assoc needs_grouping (true) )) + )) + ) + + ;id has not been encountered before, remove it from unique_ids_list + (filter + (lambda (!= (current_value) previous_ids)) + (previous_result) + ) + ) + + ;else no change to unique_ids_list + (previous_result) + ) + ) + ) + ) + + ;output true if data needs grouping + needs_grouping + ) + + + ;helper method, reads cases and id_indices from the caller's scope + ;outputs a sorted list of cases where any case for a specific series that was among cases of a different series, + ;will be moved to be together among other cases of its own series, while maintaining the original order of different series + ;e.g., if these are series IDs of cases: A A A B A A B B C B C C C the result would be: A A A A A B B B B C C C C + #!GroupDataByIds + (seq + (declare (assoc + unique_ids_list + (values + (map (lambda (unzip (current_value) id_indices)) cases) + (true) + ) + )) + + (apply "append" + (map + (lambda (let + (assoc ids (current_value 1)) + (filter + (lambda (= ids (unzip (current_value) id_indices))) + cases + ) + )) + unique_ids_list + ) + ) + ) + + #!TrainTimeSeriesAblation + (seq + (declare (assoc + feature_index_map (zip features (indices features)) + )) + + (declare (assoc + lag_features + (filter + (lambda (and + (= "lag" (get !featureAttributes [ (current_value 1) "ts_type"]) ) + (= "custom" (get !featureAttributes [ (current_value 1) "auto_derive_on_train" "derive_type"]) ) + )) + derived_features + ) + progress_features [".series_progress" ".series_index" ".series_progress_delta"] + ts_series_length_limit (retrieve_from_entity "!tsSeriesLimitLength") + time_feature_index (get
feature_index_map !tsTimeFeature) + id_features (get !tsModelFeaturesMap "series_id_features") + id_indices (unzip feature_index_map (get !tsModelFeaturesMap "series_id_features")) + original_features (replace features) + )) + (declare (assoc + series_id_features (get !featureAttributes [(first lag_features) "auto_derive_on_train" "series_id_features"]) + series_ordered_by_features (get !featureAttributes [(first lag_features) "auto_derive_on_train" "ordered_by_features"]) + ;check if the incoming cases are group sorted by series_id + needs_grouping + (call !DoesDataNeedGroupingById (assoc + data cases + id_indices id_indices + )) + )) + + ;set last column to be the original index of the cases + (assign (assoc + cases (map (lambda (append (current_value) (current_index))) cases) + features (append features ".original_index") + feature_index_map (append feature_index_map { ".original_index" (size feature_index_map)} ) + )) + + ;if the cases aren't ordered together by series ids, shuffle them around so they are grouped by ids + (if needs_grouping + (assign (assoc cases (call !GroupDataByIds) )) + ) + + ;the non-lag 'custom' derivation type features below + (assign (assoc + derived_features + (filter + (lambda + (and + (not (contains_value lag_features (current_value))) + (= "custom" (get !featureAttributes [(current_value 1) "auto_derive_on_train" "derive_type"]) ) + ) + ) + derived_features + ) + )) + + (if encode_features_on_train + (assign (assoc + cases + (map + (lambda + (call !ConvertFromInput (assoc + feature_values (current_value 1) + features features + )) + ) + cases + ) + )) + ) + + (declare (assoc + previous_ids [] + start_index 0 + end_index 0 + num_rows (size cases) + output_case_ids [] + ;map of sorted case index -> original case index used to lookup which original rows were ablated after data is sorted + ts_ablated_indices_map (null) + ;number of cases to be trained from each series, accumulated to keep track of session training index + 
num_previously_trained_cases 0 + )) + + (while (< (current_index) num_rows) + (if (= 0 (current_index)) + (assign (assoc previous_ids (unzip (get cases (current_index 1)) id_indices) )) + ) + + ;encountering next id or at end of all cases, train on the previous list of cases + (if + (or + (!= previous_ids (unzip (get cases (current_index)) id_indices) ) + (= (current_index) (- num_rows 1)) + ) + (let + (assoc + end_index + ;if last index, should be as-is + (if (= (current_index 1) (- num_rows 1)) + (current_index 1) + (- (current_index 1) 1) + ) + features original_features + ) + + ;train one entire series at a time + (accum (assoc + output_case_ids + (call !TrainSingleSeriesWithAblation (assoc + data (unzip cases (range start_index end_index)) + )) + )) + + (assign (assoc + previous_ids (unzip (get cases (current_index 1)) id_indices) + start_index (current_index 1) + num_previously_trained_cases (+ num_previously_trained_cases 1 (- end_index start_index)) + )) + ) + ) + ) + + ;return trained case_ids + output_case_ids + ) + + ;derive then train with ablation + #!TrainSingleSeriesWithAblation + (seq + + ;TODO: do this only if sorting in `needs_grouping` has not been done already + ;sort the data according to the specified features if ordering has been provided + (if (size series_ordered_by_features) + (assign (assoc + data + (call !MultiSortList (assoc + data data + column_order_indices (unzip feature_index_map series_ordered_by_features) + )) + )) + ) + ;map of index -> original index + (assign (assoc + ts_ablated_indices_map + (zip + (indices data) + (map (lambda (last (current_value))) data) + ) + )) + + ;drop that last 'index' column + (assign (assoc + data (map (lambda (trunc (current_value))) data) + )) + + (declare (assoc + ;series_index of each row, will be set to be non-zero if some series cases were already trained previously + continue_series_index 0 + trained_cases_reversed_order (false) + id_values (unzip (first data) id_indices) + previous_range 
(null) + untrained_data_size (size data) + )) + + ;check if this series has already been trained, if so, pull those cases and prepend to these + (declare (assoc + trained_series_case_ids + (contained_entities + (apply "append" + (map + (lambda + [(query_equals + (current_value 1) + (if (contains_index !numericNominalFeaturesMap (current_value 1)) + (+ (get id_values (current_index 1))) + (get id_values (current_index 1)) + ) + )] + ) + id_features + ) + ) + ) + trained_series_cases [] + )) + + ;if previously trained series cases exist for this series, prepend them to data + (if (size trained_series_case_ids) + (let + (assoc + features_indices (indices features) + prev_row_index 0 + ) + + ;overwrite trained_series_cases to contain each cases's feature values and all the progress_features values + (assign (assoc + trained_series_cases + (map + (lambda + (append + (retrieve_from_entity (current_value) (append features ".series_index") ) + (current_value) + ) + ) + trained_series_case_ids + ) + series_progress_index_feature_index (size features) + )) + + (assign (assoc + trained_series_cases + (call !MultiSortList (assoc + data trained_series_cases + column_order_indices [ time_feature_index ] + )) + )) + + (assign (assoc + ;flag set to true if previously trained cases were trained in reverse order, + ;i.e., they come later in the series than the cases being trained now + trained_cases_reversed_order + (> + (get (first trained_series_cases) time_feature_index) + (get (first data) time_feature_index) + ) + trained_series_case_ids (map (lambda (last (current_value))) trained_series_cases) + )) + + ;set continue_series_index to the would-be next index value + (assign (assoc + continue_series_index (+ 1 (get (last trained_series_cases) series_progress_index_feature_index) ) + )) + + ;previously trained series was ablated because the number of cases is less than the continue series index + (if (< (size trained_series_cases) continue_series_index) + (assign (assoc + 
trained_series_cases + ;rebuild the trained series with one slot per series index 0..(continue_series_index - 1): trained rows keep their position, previously ablated indices become (null) + (range + (lambda + (if (= (current_index) (get trained_series_cases [prev_row_index series_progress_index_feature_index])) + (seq + (accum (assoc prev_row_index 1)) + (get trained_series_cases (- prev_row_index 1)) + ) + + ;else output (null) + ) + + ) + 0 (- continue_series_index 1) 1 + ) + )) + ) + + ;combine previously trained data with this new data + (assign (assoc + data + (if trained_cases_reversed_order + (append + data + (map + (lambda (unzip (current_value) features_indices)) + trained_series_cases + ) + ) + + (append + (map + (lambda (unzip (current_value) features_indices)) + trained_series_cases + ) + data + ) + ) + )) + ) + ) + + ;now that the length of each new series is known, ensure that ts_series_length_limit is e*(longest series) + (if (> (* 2.718281828459 (size data)) ts_series_length_limit) + (assign (assoc ts_series_length_limit (* 2.718281828459 (size data)) )) + ) + ;if ts_series_length_limit has been updated to a larger value in the check above, update the model with this new value + (if (> ts_series_length_limit !tsSeriesLimitLength) + (assign_to_entities (assoc !tsSeriesLimitLength ts_series_length_limit )) + ) + + ;derive lag features and append to data + (call !DeriveLagFeaturesForData) + + ;derive custom code features + (call !DeriveCustomFeaturesForData) + + ;derive progress features + (declare (assoc + derived_progress_values_lists (call !DeriveProgressFeaturesForData) + )) + + ;there were existing cases, update their progress values + (if (size trained_series_case_ids) + (map + (lambda + (assign_to_entities + (current_value) + (zip + progress_features + (get derived_progress_values_lists (+ (current_index) (if trained_cases_reversed_order untrained_data_size 0)) ) + ) + ) + ) + trained_series_case_ids + ) + ) + + + ;append all the progress values to data + (assign (assoc + features (append features progress_features ) + data +
(map + (lambda (let + (assoc + row_index + (if trained_cases_reversed_order + (current_index 1) + (+ continue_series_index (current_index 1)) + ) + ) + (append + (current_value) + ;for each of the three progress features, grab the tuple of progress values + (get derived_progress_values_lists row_index) + ) + )) + + ;since data is combined with all the previously trained cases, + ;only use the non-trained data indices + (if trained_cases_reversed_order + (trunc data (- continue_series_index)) + (tail data (- continue_series_index)) + ) + ) + )) + + ;train and ablate cases and output created case ids + (call !TrainCasesWithAblation (assoc + cases data + trained_instance_count (+ trained_instance_count num_previously_trained_cases) + ;features have already been encoded + encode_features_on_train (false) + )) + ) + + #!DeriveLagFeaturesForData + (seq + (declare (assoc + derived_lag_values_lists + (call !AddDerivedIndependentCodeFeatures (assoc + derived_features lag_features + features features + series_data data + )) + )) + + (assign (assoc + features (append features lag_features) + data + (map + (lambda + (append (first (current_value)) (last (current_value))) + ) + data + derived_lag_values_lists + ) + )) + ) + + ;every derived feature must be immediately fed back into data because it may be needed by the next derived feature + #!DeriveCustomFeaturesForData + (map + (lambda (let + (assoc feature (current_value 1)) + (declare (assoc + derived_custom_values + (call !AddDerivedCodeFeature (assoc + feature feature + features features + series_data data + )) + )) + + (assign (assoc + features (append features feature) + data + (map + (lambda + (append (first (current_value)) (last (current_value))) + ) + data + derived_custom_values + ) + )) + )) + derived_features + ) + + + ;all the time values + #!DeriveProgressFeaturesForData + (declare + (assoc + sorted_time_values (map (lambda (get (current_value) time_feature_index)) data) + ) + (declare (assoc + range (- (last 
sorted_time_values) (first sorted_time_values)) + previous_value (first sorted_time_values) + first_value (first sorted_time_values) + fixed_delta (/ 1 (- (size sorted_time_values) 1)) + )) + + ;output a list of tuples [ progress%, index, delta_to_previous ] for each row in the data + (map + (lambda (let + (assoc + progress (/ (- (current_value 1) first_value) range) + ;delta is the % change, don't allow 0, use the fixed delta instead + delta (or (/ (- (current_value 1) previous_value) range) fixed_delta) + ) + + ;if the series is of length 1, set progress and delta to be 1 and prevent a divide by 0 + (if (= 0 range) + (assign (assoc + progress 1 + delta 1 + )) + ) + (assign (assoc previous_value (current_value 1))) + ;output the tuple + [progress (current_index 1) delta] + )) + + sorted_time_values + ) + ) +) \ No newline at end of file diff --git a/unit_tests/ut_h_ablate.amlg b/unit_tests/ut_h_ablate.amlg index 43da682ff..ca2180d92 100644 --- a/unit_tests/ut_h_ablate.amlg +++ b/unit_tests/ut_h_ablate.amlg @@ -106,20 +106,7 @@ )) )) - (declare (assoc - ablate_train_payload - (call_entity "howso" "train" (assoc - features features - cases - (list - (list 0.5 0.5) - (list 11 11) - ) - session "unit_test" - )) - )) - - (print "third train call ablates: ") + (print "second train call ablates: ") (call assert_same (assoc obs (get ablate_train_payload (list 1 "payload" "ablated_indices")) exp (list 8) @@ -189,6 +176,8 @@ exp (list) )) + (call exit_if_failures (assoc msg "High entropy case was ablated") ) + (declare (assoc eleven_cases (call_entity "howso" "get_cases" (assoc @@ -198,20 +187,12 @@ )) )) - (call exit_if_failures (assoc msg "High entropy case was ablated") ) - (print "low entropy case is in model: ") (call assert_same (assoc obs (get eleven_cases (list 1 "payload" "cases")) exp (list (list 11 11)) )) - (print "High entropy case is not in model: ") - (call assert_same (assoc - obs (get point_five_cases (list 1 "payload" "cases")) - exp (list) - )) - (call 
exit_if_failures (assoc msg "Low entropy case was not ablated") ) (call_entity "howso" "set_auto_ablation_params" (assoc @@ -228,13 +209,16 @@ targeted_model "targetless" )) - (call_entity "howso" "train" (assoc - features features - cases (list - (list 0.5 0.5) - (list 11 11) - ) - session "unit_test" + (assign (assoc + result + (call_entity "howso" "train" (assoc + features features + cases (list + (list 0 0) + (list 11 11) + ) + session "unit_test" + )) )) (assign (assoc @@ -247,12 +231,8 @@ (print "Data reduction reduces model size by the expected amount: ") (call assert_same (assoc - obs - (size (get (call_entity "howso" "get_cases") - (list 1 "payload" "cases") - )) - exp - (floor (/ pre_reduction_size 2)) + obs (get (call_entity "howso" "get_num_training_cases") [1 "payload" "count"]) + exp (floor (/ pre_reduction_size 2)) )) (call exit_if_failures (assoc msg "Data reduction does not reduce model size by the expected amount")) diff --git a/unit_tests/ut_h_edit_dist_features.amlg b/unit_tests/ut_h_edit_dist_features.amlg index c71d6cbb8..ccf326c7c 100644 --- a/unit_tests/ut_h_edit_dist_features.amlg +++ b/unit_tests/ut_h_edit_dist_features.amlg @@ -435,10 +435,10 @@ exp (assoc feature_contributions_full { - amalgam 1 - yaml 1 - json 1 - x 1 + amalgam 3.8 + yaml 3.5 + json 3.4 + x 3.9 } directional_feature_contributions_full { amalgam 0 @@ -446,6 +446,7 @@ json 0 x 0 } + ) percent 0.5 ))