diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3072824c5..328a3e299 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,7 +114,6 @@ set(COMMON_SOURCE
 	src/Amalgam/AssetManager.h
 	src/Amalgam/BinaryPacking.cpp
 	src/Amalgam/BinaryPacking.h
-	src/Amalgam/Conviction.h
 	src/Amalgam/Cryptography.cpp
 	src/Amalgam/Cryptography.h
 	src/Amalgam/DateTimeFormat.cpp
@@ -128,6 +127,8 @@ set(COMMON_SOURCE
 	src/Amalgam/entity/EntityManipulation.h
 	src/Amalgam/entity/EntityQueries.cpp
 	src/Amalgam/entity/EntityQueries.h
+	src/Amalgam/entity/EntityQueriesDensityFunctions.cpp
+	src/Amalgam/entity/EntityQueriesDensityFunctions.h
 	src/Amalgam/entity/EntityQueriesStatistics.h
 	src/Amalgam/entity/EntityQueryBuilder.h
 	src/Amalgam/entity/EntityQueryCaches.cpp
diff --git a/docs/language.js b/docs/language.js
index faf077a59..6d86e26aa 100644
--- a/docs/language.js
+++ b/docs/language.js
@@ -1701,6 +1701,15 @@ var data = [
 "description" : "When used as a query argument, computes the nearest neighbors to every entity given by entity_ids_to_compute, normalizes their influence weights, and accumulates the entity's total influence weights relative to every other case. It returns a list of all cases whose cumulative neighbor values are greater than zero. feature_labels specifies the names of the features to consider during the computation. The parameter p_value is the generalized norm parameter, where 1, the default, is probability space and Manhattan distance, 2 is Euclidean distance, etc. The weights parameter specifies how to weight the different dimensions. If weights is a list, each value maps to its respective element in the vectors. If weights is null, then it will assume that the weights are 1. If weights is an assoc, then the parameter value_names will select the weights from the assoc. If weights is an assoc of assocs, then the parameter weights_selection_feature will select which set of weights to use and redistribute and normalize any probability masses for unused features to features that are used. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_bool\", \"nominal_number\", \"nominal_string\", \"nominal_code\", \"continuous_number\", \"continuous_number_cyclic\", \"continuous_string\", \"continuous_code_no_recursive_matching\", and \"continuous_code\". Nominals evaluate whether the two values are the same, and continuous types evaluate the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, each particular distance_type specifies which attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given. Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviation is equivalent to setting each deviation to 0, unless distance_transform is \"surprisal\" or \"surprisal_to_prob\", in which case it will attempt to infer a deviation. Each deviation for each feature can be a single value or a list.
If it is a single value, that value is used as the deviation, and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For nominal types, the value for each feature can be a numeric deviation, an assoc, or a list. If the value is an assoc it specifies deviation information, where each key of the assoc is the nominal value, and each value of the assoc can be a numeric deviation value, a list, or an assoc, with the list specifying an assoc optionally followed by the default deviation. This inner assoc, regardless of whether it is in a list, maps the value to each actual value's deviation. The parameter entities_returned specifies either the number of entities to return, or is a list. If entities_returned is a list, the first element of the list specifies the minimum incremental probability or percent of mass that the next largest entity would comprise (e.g., 0.05 would return at most 20 entities if they were all equal in percent of mass), and the other elements are optional. The second element is the minimum number of entities to return, the third element is the maximum number of entities to return, and the fourth indicates the number of additional entities to include after any of the aforementioned thresholds (defaulting to zero). If there is disagreement among the constraints for entities_returned, the constraint yielding the fewest entities will govern the number of entities returned. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their corresponding computed values. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal\" then distances will be calculated as surprisals, and weights will not be applied to the values. If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted (defaulting to 1.0), it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances, only using entity weights for nonpositive values of distance_transform. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name.
If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", "example" : "(compute_on_contained_entities (list (query_entity_cumulative_nearest_entity_weights 2 (list \"x\" \"y\") (null) 2 (null) (null) (null) (null) (null) -1 (null) \"random seed 1234\") ))" },
+
+	{
+		"parameter" : "query_entity_clusters list|number entities_returned list feature_labels number min_cluster_weight [number p_value] [list|assoc|assoc of assoc weights] [list|assoc distance_types] [list|assoc attributes] [list|assoc deviations] [string weights_selection_feature] [string|number distance_transform] [string entity_weight_label_name] [number random_seed] [string radius_label] [string numerical_precision] [* output_sorted_list]",
+		"output" : "query",
+		"new value" : "partial",
+		"concurrency" : true,
+		"description" : "When used as a query argument, computes cluster ids for each of the entities. min_cluster_weight, which may be 0, is the smallest accumulated entity weight that will be considered a cluster. The cluster ids are nonnegative integers, with an id of zero denoting entities that are independent and isolated from all clusters. feature_labels specifies the names of the features to consider during the computation. The parameter p_value is the generalized norm parameter, where 1, the default, is probability space and Manhattan distance, 2 is Euclidean distance, etc. The weights parameter specifies how to weight the different dimensions. If weights is a list, each value maps to its respective element in the vectors. If weights is null, then it will assume that the weights are 1. If weights is an assoc, then the parameter value_names will select the weights from the assoc. If weights is an assoc of assocs, then the parameter weights_selection_feature will select which set of weights to use and redistribute and normalize any probability masses for unused features to features that are used. The parameter distance_types is either a list of strings or an assoc of strings indicating the type of distance for each feature. Allowed values are \"nominal_bool\", \"nominal_number\", \"nominal_string\", \"nominal_code\", \"continuous_number\", \"continuous_number_cyclic\", \"continuous_string\", \"continuous_code_no_recursive_matching\", and \"continuous_code\". Nominals evaluate whether the two values are the same, and continuous types evaluate the difference between the two values. The numeric, string, or code modifier specifies how the difference is measured, and cyclic means it is a difference that wraps around. \nFor attributes, each particular distance_type specifies which attributes are expected. For a nominal distance_type, a number indicates the nominal count, whereas null will infer from the values given.
Cyclic requires a single value, which is the upper bound of the difference for the cycle range (e.g., if the value is 360, then the supremum difference between two values will be 360, leading 1 and 359 to have a difference of 2).\n Deviations are used during distance calculation to specify uncertainty per-element, the minimum difference between two values prior to exponentiation. Specifying null as a deviation is equivalent to setting each deviation to 0, unless distance_transform is \"surprisal\" or \"surprisal_to_prob\", in which case it will attempt to infer a deviation. Each deviation for each feature can be a single value or a list. If it is a single value, that value is used as the deviation, and differences and deviations for null values will automatically be computed from the data based on the maximum difference. If a deviation is provided as a list, then the first value is the deviation, the second value is the difference to use when one of the values being compared is null, and the third value is the difference to use when both of the values are null. If the third value is omitted, it will use the second value for both. If both of the null values are omitted, then it will compute the maximum difference and use that for both. For nominal types, the value for each feature can be a numeric deviation, an assoc, or a list. If the value is an assoc it specifies deviation information, where each key of the assoc is the nominal value, and each value of the assoc can be a numeric deviation value, a list, or an assoc, with the list specifying an assoc optionally followed by the default deviation. This inner assoc, regardless of whether it is in a list, maps the value to each actual value's deviation. The parameter entities_returned specifies either the number of entities to return, or is a list. If entities_returned is a list, the first element of the list specifies the minimum incremental probability or percent of mass that the next largest entity would comprise (e.g., 0.05 would return at most 20 entities if they were all equal in percent of mass), and the other elements are optional. The second element is the minimum number of entities to return, the third element is the maximum number of entities to return, and the fourth indicates the number of additional entities to include after any of the aforementioned thresholds (defaulting to zero). If there is disagreement among the constraints for entities_returned, the constraint yielding the fewest entities will govern the number of entities returned. The optional radius_label parameter represents the label name of the radius of the entity (if the radius is within the distance, the entity is selected). The optional numerical_precision represents one of three values: \"precise\", which computes every distance with high numerical precision, \"fast\", which computes every distance with lower but faster numerical precision, and \"recompute_precise\", which computes distances quickly with lower precision but then recomputes any distance values that will be returned with higher precision. If called last with compute_on_contained_entities, then it returns an assoc of the entity ids with their cluster ids. A transform will be applied to these distances based on distance_transform. If distance_transform is \"surprisal\" then distances will be calculated as surprisals, and weights will not be applied to the values.
If distance_transform is \"surprisal_to_prob\" then distances will be calculated as surprisals and will be transformed back into probabilities for aggregating, and then transformed back to surprisals. If distance_transform is a number or omitted, which will default to 1.0, then it will be used as a parameter for a generalized mean (e.g., -1 yields the harmonic mean) to average the distances, only using entity weights for nonpositive values of distance_transform. If entity_weight_label_name is specified, it will multiply the resulting value for each entity (after distance_weight_exponent, etc. have been applied) by the value in the label of entity_weight_label_name. If output_sorted_list is not specified or is false, then it will return an assoc of entity string id as the key with the distance as the value; if output_sorted_list is true, then it will return a list of lists, where the first list is the entity ids and the second list contains the corresponding distances, where both lists are in sorted order starting with the closest or most important (based on whether distance_weight_exponent is positive or negative respectively). If output_sorted_list is a string, then it will additionally return a list where the values correspond to the values of the labels for each respective entity. If output_sorted_list is a list of strings, then it will additionally return a list of values for each of the label values for each respective entity.", + "example" : "(compute_on_contained_entities (list (query_entity_clusters 2 (list \"x\" \"y\") (null) 2 (null) (null) (null) (null) (null) -1 (null) \"random seed 1234\") ))" + }, { "parameter" : "contains_label [id_path entity] string label_name", diff --git a/src/Amalgam/Amalgam.vcxproj b/src/Amalgam/Amalgam.vcxproj index a50d1e83c..d7f597d88 100644 --- a/src/Amalgam/Amalgam.vcxproj +++ b/src/Amalgam/Amalgam.vcxproj @@ -562,6 +562,7 @@ + @@ -611,7 +612,6 @@ - @@ -619,6 +619,7 @@ + diff --git a/src/Amalgam/Amalgam.vcxproj.filters b/src/Amalgam/Amalgam.vcxproj.filters index 9ef37f5ff..9a0a2c58a 100644 --- a/src/Amalgam/Amalgam.vcxproj.filters +++ b/src/Amalgam/Amalgam.vcxproj.filters @@ -203,9 +203,6 @@ Header Files - - Header Files - Header Files @@ -338,6 +335,9 @@ Header Files + + Header Files + diff --git a/src/Amalgam/KnnCache.h b/src/Amalgam/KnnCache.h index 7af324ffb..0947e4323 100644 --- a/src/Amalgam/KnnCache.h +++ b/src/Amalgam/KnnCache.h @@ -151,6 +151,13 @@ class KnnCache *positionLabelIds, index, top_k, radiusLabelId, from_indices, true, expand_to_first_nonzero_distance, out); } + //returns the container for the knn cache for index + //note that this should only be called after PreCacheKnn has been called + std::vector> &GetKnnCache(size_t index) + { + return cachedNeighbors[index]; + } + //returns a pointer to the relevant indices of the cache constexpr BitArrayIntegerSet *GetRelevantEntities() { @@ -169,6 +176,12 @@ class KnnCache return relevantIndices->GetEndInteger(); } + //returns the generalized distance evaluator + inline GeneralizedDistanceEvaluator *GetDistanceEvaluator() + { + return distEvaluator; + } + protected: //cache of nearest neighbor results. The index of cache is the entity, and the corresponding vector are its nearest neighbors. 
diff --git a/src/Amalgam/Amalgam.vcxproj b/src/Amalgam/Amalgam.vcxproj
index a50d1e83c..d7f597d88 100644
--- a/src/Amalgam/Amalgam.vcxproj
+++ b/src/Amalgam/Amalgam.vcxproj
@@ -562,6 +562,7 @@
+
@@ -611,7 +612,6 @@
-
@@ -619,6 +619,7 @@
+
diff --git a/src/Amalgam/Amalgam.vcxproj.filters b/src/Amalgam/Amalgam.vcxproj.filters
index 9ef37f5ff..9a0a2c58a 100644
--- a/src/Amalgam/Amalgam.vcxproj.filters
+++ b/src/Amalgam/Amalgam.vcxproj.filters
@@ -203,9 +203,6 @@
 	Header Files
-
-	Header Files
-
 	Header Files
@@ -338,6 +335,9 @@
 	Header Files
+
+	Header Files
+
diff --git a/src/Amalgam/KnnCache.h b/src/Amalgam/KnnCache.h
index 7af324ffb..0947e4323 100644
--- a/src/Amalgam/KnnCache.h
+++ b/src/Amalgam/KnnCache.h
@@ -151,6 +151,13 @@ class KnnCache
 			*positionLabelIds, index, top_k, radiusLabelId, from_indices, true, expand_to_first_nonzero_distance, out);
 	}
 
+	//returns the container for the knn cache for index
+	//note that this should only be called after PreCacheKnn has been called
+	std::vector<DistanceReferencePair<size_t>> &GetKnnCache(size_t index)
+	{
+		return cachedNeighbors[index];
+	}
+
 	//returns a pointer to the relevant indices of the cache
 	constexpr BitArrayIntegerSet *GetRelevantEntities()
 	{
@@ -169,6 +176,12 @@
 		return relevantIndices->GetEndInteger();
 	}
 
+	//returns the generalized distance evaluator
+	inline GeneralizedDistanceEvaluator *GetDistanceEvaluator()
+	{
+		return distEvaluator;
+	}
+
 protected:
 
 	//cache of nearest neighbor results. The index of the cache is the entity, and the corresponding vector contains its nearest neighbors.
 	std::vector<std::vector<DistanceReferencePair<size_t>>> cachedNeighbors;
diff --git a/src/Amalgam/Opcodes.cpp b/src/Amalgam/Opcodes.cpp
index 8135c3eff..e9571fb59 100644
--- a/src/Amalgam/Opcodes.cpp
+++ b/src/Amalgam/Opcodes.cpp
@@ -289,6 +289,7 @@ void StringInternPool::InitializeStaticStrings()
 	EmplaceNodeTypeString(ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS, "query_entity_distance_contributions");
 	EmplaceNodeTypeString(ENT_QUERY_ENTITY_KL_DIVERGENCES, "query_entity_kl_divergences");
 	EmplaceNodeTypeString(ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS, "query_entity_cumulative_nearest_entity_weights");
+	EmplaceNodeTypeString(ENT_QUERY_ENTITY_CLUSTERS, "query_entity_clusters");
 
 	//entity access
 	EmplaceNodeTypeString(ENT_CONTAINS_LABEL, "contains_label");
diff --git a/src/Amalgam/Opcodes.h b/src/Amalgam/Opcodes.h
index 8c0dd81e1..e31407616 100644
--- a/src/Amalgam/Opcodes.h
+++ b/src/Amalgam/Opcodes.h
@@ -272,6 +272,7 @@ enum EvaluableNodeType : uint8_t
 	ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS,
 	ENT_QUERY_ENTITY_KL_DIVERGENCES,
 	ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS,
+	ENT_QUERY_ENTITY_CLUSTERS,
 
 	//entity access
 	ENT_CONTAINS_LABEL,
@@ -357,6 +358,7 @@ constexpr OrderedChildNodeType GetOpcodeOrderedChildNodeType(EvaluableNodeType t
 		case ENT_QUERY_DISTANCE_CONTRIBUTIONS: case ENT_QUERY_ENTITY_CONVICTIONS: case ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE:
 		case ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS: case ENT_QUERY_ENTITY_KL_DIVERGENCES: case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
+		case ENT_QUERY_ENTITY_CLUSTERS:
 		case ENT_CONTAINS_LABEL: case ENT_ASSIGN_TO_ENTITIES: case ENT_DIRECT_ASSIGN_TO_ENTITIES: case ENT_ACCUM_TO_ENTITIES:
 		case ENT_RETRIEVE_FROM_ENTITY: case ENT_DIRECT_RETRIEVE_FROM_ENTITY:
@@ -539,7 +541,8 @@ constexpr OpcodeNewValueReturnType GetOpcodeNewValueReturnType(EvaluableNodeType
 		case ENT_QUERY_ENTITY_CONVICTIONS: case ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE: case ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS:
 		case ENT_QUERY_ENTITY_KL_DIVERGENCES: case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
-			return ONVRT_PARTIALLY_NEW_VALUE;
+		case ENT_QUERY_ENTITY_CLUSTERS:
+			return ONVRT_PARTIALLY_NEW_VALUE;
 
 		case ENT_RAND:
 		case ENT_FIRST: case ENT_TAIL: case ENT_LAST: case ENT_TRUNC:
@@ -643,6 +646,7 @@ constexpr bool IsEvaluableNodeTypeQuery(EvaluableNodeType t)
 		|| t == ENT_QUERY_DISTANCE_CONTRIBUTIONS || t == ENT_QUERY_ENTITY_CONVICTIONS || t == ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE
 		|| t == ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS || t == ENT_QUERY_ENTITY_KL_DIVERGENCES || t == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+		|| t == ENT_QUERY_ENTITY_CLUSTERS
 		);
 }
diff --git a/src/Amalgam/amlg_code/full_test.amlg b/src/Amalgam/amlg_code/full_test.amlg
index fe3ebbe9f..89dfef57d 100644
--- a/src/Amalgam/amlg_code/full_test.amlg
+++ b/src/Amalgam/amlg_code/full_test.amlg
@@ -3585,6 +3585,66 @@
 (print "cyclic test expected: 155, 200, 190 ... deg values of 0 8 and 12:\n")
 (map (lambda (print (current_index) ": " (current_value) " " (retrieve_entity_root (list "CyclicTestEntity" (current_index 1))))) buds)
 
+(print "--query_entity_clusters--\n")
+(create_entities "ClusterTestEntity" (null))
+(create_entities
+	["ClusterTestEntity" "invalid"] (zip_labels ["w"] [1])
+
+	["ClusterTestEntity" "g1c1"] (zip_labels ["A" "B" "w"] [1 1 1])
+	["ClusterTestEntity" "g1c2"] (zip_labels ["A" "B" "w"] [1 2 1])
+	["ClusterTestEntity" "g1c3"] (zip_labels ["A" "B" "w"] [2 1 1])
+	["ClusterTestEntity" "g1c4"] (zip_labels ["A" "B" "w"] [2 2 1])
+
+	["ClusterTestEntity" "g2c1"] (zip_labels ["A" "B" "w"] [11 11 1])
+	["ClusterTestEntity" "g2c2"] (zip_labels ["A" "B" "w"] [11 12 1])
+	["ClusterTestEntity" "g2c3"] (zip_labels ["A" "B" "w"] [12 11 1])
+	["ClusterTestEntity" "g2c4"] (zip_labels ["A" "B" "w"] [12 12 1])
+
+	["ClusterTestEntity" "g3c1"] (zip_labels ["A" "B" "w"] [100 100 1])
+	["ClusterTestEntity" "g3c2"] (zip_labels ["A" "B" "w"] [100 200 1])
+	["ClusterTestEntity" "g3c3"] (zip_labels ["A" "B" "w"] [100 300 1])
+	["ClusterTestEntity" "g3c4"] (zip_labels ["A" "B" "w"] [200 100 1])
+	["ClusterTestEntity" "g3c5"] (zip_labels ["A" "B" "w"] [200 200 1])
+	["ClusterTestEntity" "g3c6"] (zip_labels ["A" "B" "w"] [200 300 1])
+	["ClusterTestEntity" "g3c7"] (zip_labels ["A" "B" "w"] [300 100 1])
+	["ClusterTestEntity" "g3c8"] (zip_labels ["A" "B" "w"] [300 200 1])
+	["ClusterTestEntity" "g3c9"] (zip_labels ["A" "B" "w"] [300 300 1])
+
+	["ClusterTestEntity" "g4c1"] (zip_labels ["A" "B" "w"] [1010 1010 1])
+	["ClusterTestEntity" "g4c2"] (zip_labels ["A" "B" "w"] [1010 1020 1])
+	["ClusterTestEntity" "g4c3"] (zip_labels ["A" "B" "w"] [1010 1030 1])
+	["ClusterTestEntity" "g4c4"] (zip_labels ["A" "B" "w"] [1020 1010 1])
+	["ClusterTestEntity" "g4c5"] (zip_labels ["A" "B" "w"] [1020 1020 1])
+	["ClusterTestEntity" "g4c6"] (zip_labels ["A" "B" "w"] [1020 1030 1])
+	["ClusterTestEntity" "g4c7"] (zip_labels ["A" "B" "w"] [1030 1010 1])
+	["ClusterTestEntity" "g4c8"] (zip_labels ["A" "B" "w"] [1030 1020 1])
+	["ClusterTestEntity" "g4c9"] (zip_labels ["A" "B" "w"] [1030 1030 1])
+
+	["ClusterTestEntity" "g5c1"] (zip_labels ["A" "B" "w"] [10000 10000 1])
+)
+
+(print
+	(compute_on_contained_entities "ClusterTestEntity"
+		(query_entity_clusters
+			(list 0.05 1 20) ;(get hyperparam_map "k")
+			["A" "B"] ;features
+			2 ;min cluster weight
+			1
+			(null) ;feature_weights
+			(zip ["A" "B"] "continuous_number")
+			(null)
+			{"A" 0.5 "B" 0.5 } ;feature_deviations
+			(null)
+			"surprisal"
+			distribute_weight_feature
+			;use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed")
+			"fixed rand seed"
+			(null) ;radius
+			(null) ;!numericalPrecision
+		)
+	)
+)
+
 (print "--contains_label--\n")
 (print (contains_label "label3") "\n")
 (print (contains_label "hhccc") "\n")
diff --git a/src/Amalgam/amlg_code/test.amlg b/src/Amalgam/amlg_code/test.amlg
index ec26745f8..3810f37cf 100644
--- a/src/Amalgam/amlg_code/test.amlg
+++ b/src/Amalgam/amlg_code/test.amlg
@@ -1,13 +1,161 @@
 (seq
+(seq ;(if .false
+(seq
+	(print "--query_entity_clusters--\n")
+	(create_entities "ClusterTestEntity" (null))
+	(create_entities
+		["ClusterTestEntity" "invalid"] (zip_labels ["w"] [1])
-(print (= (list 4 4 5) (list 4 4 5) ) "\n")
-(print (= (list 4 4 5) (list 4 4 6) ) "\n")
-(print (= (list 4 4 5) (list 4 5 4) ) "\n")
+		["ClusterTestEntity" "g1c1"] (zip_labels ["A" "B" "w"] [1 1 1])
+		["ClusterTestEntity" "g1c2"] (zip_labels ["A" "B" "w"] [1 2 1])
["ClusterTestEntity" "g1c3"] (zip_labels ["A" "B" "w"] [2 1 1]) + ["ClusterTestEntity" "g1c4"] (zip_labels ["A" "B" "w"] [2 2 1]) -(print (= (unordered_list 4 4 5) (unordered_list 4 4 5) ) "\n") -(print (= (unordered_list 4 4 5) (unordered_list 4 4 6) ) "\n") -(print (= (unordered_list 4 4 5) (unordered_list 4 5 4) ) "\n") + ["ClusterTestEntity" "g2c1"] (zip_labels ["A" "B" "w"] [11 11 1]) + ["ClusterTestEntity" "g2c2"] (zip_labels ["A" "B" "w"] [11 12 1]) + ["ClusterTestEntity" "g2c3"] (zip_labels ["A" "B" "w"] [12 11 1]) + ["ClusterTestEntity" "g2c4"] (zip_labels ["A" "B" "w"] [12 12 1]) -(print (= (set_type (range 0 100) "unordered_list") (set_type (reverse (range 0 100)) "unordered_list")) "\n") + ["ClusterTestEntity" "g3c1"] (zip_labels ["A" "B" "w"] [100 100 1]) + ["ClusterTestEntity" "g3c2"] (zip_labels ["A" "B" "w"] [100 200 1]) + ["ClusterTestEntity" "g3c3"] (zip_labels ["A" "B" "w"] [100 300 1]) + ["ClusterTestEntity" "g3c4"] (zip_labels ["A" "B" "w"] [200 100 1]) + ["ClusterTestEntity" "g3c5"] (zip_labels ["A" "B" "w"] [200 200 1]) + ["ClusterTestEntity" "g3c6"] (zip_labels ["A" "B" "w"] [200 300 1]) + ["ClusterTestEntity" "g3c7"] (zip_labels ["A" "B" "w"] [300 100 1]) + ["ClusterTestEntity" "g3c8"] (zip_labels ["A" "B" "w"] [300 200 1]) + ["ClusterTestEntity" "g3c8"] (zip_labels ["A" "B" "w"] [300 300 1]) - ) \ No newline at end of file + ["ClusterTestEntity" "g4c1"] (zip_labels ["A" "B" "w"] [1010 1010 1]) + ["ClusterTestEntity" "g4c2"] (zip_labels ["A" "B" "w"] [1010 1020 1]) + ["ClusterTestEntity" "g4c3"] (zip_labels ["A" "B" "w"] [1010 1030 1]) + ["ClusterTestEntity" "g4c4"] (zip_labels ["A" "B" "w"] [1020 1010 1]) + ["ClusterTestEntity" "g4c5"] (zip_labels ["A" "B" "w"] [1020 1020 1]) + ["ClusterTestEntity" "g4c6"] (zip_labels ["A" "B" "w"] [1020 1030 1]) + ["ClusterTestEntity" "g4c7"] (zip_labels ["A" "B" "w"] [1030 1010 1]) + ["ClusterTestEntity" "g4c8"] (zip_labels ["A" "B" "w"] [1030 1020 1]) + ["ClusterTestEntity" "g4c8"] (zip_labels ["A" "B" "w"] [1030 1030 1]) + + ["ClusterTestEntity" "g5c1"] (zip_labels ["A" "B" "w"] [10000 10000 1]) + ) + + (print + (compute_on_contained_entities "ClusterTestEntity" + (query_entity_clusters + [0.05 3 20] + ["A" "B"] ;features + 3 ;min cluster size + 1 + (null) ;feature_weights + (zip ["A" "B"] "continuous_numeric") + (null) + {"A" 0.5 "B" 0.5 } ;feature_deviations + (null) + "surprisal" + distribute_weight_feature + ;use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed") + "fixed rand seed" + (null) ;radius + (null) ;!numericalPrecision + ) + ) + ) +) +(seq + + (map + (lambda + (create_entities (concat "case" (current_index)) (zip_labels ["A" "B" "C"] (current_value)) ) + ) + [ + ;cluster 1 + [1 1 1] + [1 2 1] + [2 1 1] + [2 2 2] + [2 2 2] + [3.5 1 2] + [3.1 0 2] + [1.2 3 1.3] + [2 1.9 0.8] + [2.3 1.6 0.5] + [1.4 1.9 1.4] + [2 3 3] + + + ;cluster 2 + [16 19 19] + ;this case has a very low 3rd value + [15 17 10] + [17 17 19] + [17 15 19] + [19 16 15] + [18 16 17] + [16 17 17] + [19 15 17] + [15 18 16] + [15 17 17] + [16 19 18] + [16 15 19] + + + ;cluster 3 though some may be unclustered + [7 9 10] + [13 12 7] + [11 11 12] + [13 8 12] + [13 10 6] + [9 8 12] + [8 9 12] + [10 11 11] + [9 12 7] + [9 7 10] + [8 8 9] + [9 14 10] + + questionables + [14 10 18] + [17 7 10] + [18 2 8] + ] + ) + + (declare (assoc + cluster_map + (compute_on_contained_entities + (query_entity_clusters + [0.05 3 20 3] + ["A" "B" "C"] + 3 ;min cluster size + 1 ;p + (null) ;{ "A" 0.333333 "B" 0.333333 "C" 0.333333} 
				;feature_weights
+				{ "A" "continuous_number" "B" "continuous_number" "C" "continuous_number" } ;!queryDistanceTypeMap
+				(null) ;(get !hyperparameterMetadataMap "featureDomainAttributes")
+				{"A" 0.05 "B" 0.05 "C" 0.05} ;feature_deviations
+				(null) ;"C" ; feature weight feature
+				"surprisal" ;(get hyperparam_map "dt")
+				(null) ; weight feature
+				"tie_break_random_seed"
+				(null) ;radius
+				(null) ; !numericalPrecision
+			)
+		)
+	))
+	(print
+		;for every cluster output the number of cases in that cluster
+		(map
+			(lambda
+				(size
+					;keep cases matching this cluster id
+					(filter (lambda (= (current_index 1) (current_value))) cluster_map)
+				)
+			)
+			;makes an assoc keyed by cluster id
+			(zip (values cluster_map))
+		)
+	)
+
+	(print cluster_map)
+)
+)
+)
\ No newline at end of file
diff --git a/src/Amalgam/entity/EntityQueries.cpp b/src/Amalgam/entity/EntityQueries.cpp
index f49c88e7a..26c25e531 100644
--- a/src/Amalgam/entity/EntityQueries.cpp
+++ b/src/Amalgam/entity/EntityQueries.cpp
@@ -221,6 +221,7 @@ bool EntityQueryCondition::DoesEntityMatchCondition(Entity *e)
 		case ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE:
 		case ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS:
 		case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
+		case ENT_QUERY_ENTITY_CLUSTERS:
 			return false;
 
 		default:
diff --git a/src/Amalgam/entity/EntityQueries.h b/src/Amalgam/entity/EntityQueries.h
index c56b64871..204834450 100644
--- a/src/Amalgam/entity/EntityQueries.h
+++ b/src/Amalgam/entity/EntityQueries.h
@@ -115,6 +115,9 @@ class EntityQueryCondition
 	//quantile percentage, for ENT_QUERY_QUANTILE
 	double qPercentage;
 
+	//minimum cluster weight for ENT_QUERY_ENTITY_CLUSTERS
+	double minClusterWeight;
+
 	//for ENT_QUERY_GENERALIZED_MEAN
 	double center;
 	bool calculateMoment;
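The new EntityQueriesDensityFunctions.cpp below builds a minimum spanning tree over the mutual reachability graph. For reference, the standard HDBSCAN definition that the max({...}) expression in BuildMutualReachabilityMST mirrors is, for entities $a$ and $b$ with core distances $\mathrm{core}(a)$ and $\mathrm{core}(b)$ and pairwise distance $d(a,b)$:

$$ d_{\mathrm{mreach}}(a, b) = \max\{\, \mathrm{core}(a),\ \mathrm{core}(b),\ d(a, b) \,\} $$

Note that the code uses each entity's distance contribution as its core distance, which is an approximation relative to the usual k-th-nearest-neighbor definition of core distance.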
diff --git a/src/Amalgam/entity/EntityQueriesDensityFunctions.cpp b/src/Amalgam/entity/EntityQueriesDensityFunctions.cpp
new file mode 100644
index 000000000..bf4fc8ad1
--- /dev/null
+++ b/src/Amalgam/entity/EntityQueriesDensityFunctions.cpp
@@ -0,0 +1,566 @@
+//project headers:
+#include "EntityQueriesDensityFunctions.h"
+
+#ifdef HDBSCAN
+void EntityQueriesDensityProcessor::BuildMutualReachabilityMST(std::vector<double> &core_distances, std::vector<size_t> &order,
+	std::vector<double> &edge_distances, std::vector<size_t> &parent_entities)
+{
+	size_t num_entity_ids = core_distances.size();
+	size_t num_entities = order.size();
+	edge_distances.clear();
+	edge_distances.resize(num_entity_ids, std::numeric_limits<double>::infinity());
+	parent_entities.clear();
+	parent_entities.resize(num_entity_ids, std::numeric_limits<size_t>::max());
+
+	//used to mark vertices (entities) as they are added to the tree
+	std::vector<bool> processed_flags;
+	processed_flags.resize(num_entity_ids, false);
+
+	//initialize the first point, largest core distance, as the root
+	size_t root = order[0];
+	processed_flags[root] = true;
+	//root points to itself
+	parent_entities[root] = root;
+	//no edge weight
+	edge_distances[root] = 0.0;
+
+	for(size_t order_index = 1; order_index < num_entities; ++order_index)
+	{
+		size_t cur_entity_index = order[order_index];
+
+		size_t best_parent = std::numeric_limits<size_t>::max();
+		double best_dist = std::numeric_limits<double>::max();
+
+		auto &neighbors = knnCache->GetKnnCache(cur_entity_index);
+		for(auto &nb : neighbors)
+		{
+			size_t neighbor_entity_index = nb.reference;
+			//ignore neighbors that have not yet been processed
+			if(!processed_flags[neighbor_entity_index])
+				continue;
+
+			double mutual_reachability_distance = std::max({ core_distances[cur_entity_index],
+				core_distances[neighbor_entity_index], nb.distance });
+
+			if(mutual_reachability_distance < best_dist)
+			{
+				best_dist = mutual_reachability_distance;
+				best_parent = neighbor_entity_index;
+			}
+		}
+
+		//it is possible, though rare, that none of the neighbors have been processed yet,
+		// e.g., the graph is disconnected. if so, fall back to a
+		// direct connection to the root using only core distances
+		if(best_parent == std::numeric_limits<size_t>::max())
+		{
+			//TODO 24886: need a better way to connect disconnected cliques, maybe get nearest neighbor among spanning tree?
+			best_dist = std::max(core_distances[cur_entity_index], core_distances[root]);
+			best_parent = root;
+		}
+
+		//record the processed entity
+		parent_entities[cur_entity_index] = best_parent;
+		edge_distances[cur_entity_index] = best_dist;
+		processed_flags[cur_entity_index] = true;
+	}
+}
+
+void EntityQueriesDensityProcessor::ExtractClustersFromMST(EntityReferenceSet &entities_to_compute,
+	std::vector<double> &core_distances, std::vector<double> &edge_distances,
+	std::vector<size_t> &parent_entities, std::vector<size_t> &order, double minimum_cluster_weight,
+	std::vector<size_t> &cluster_ids, std::vector<double> &stabilities)
+{
+	size_t num_entity_ids = edge_distances.size();
+
+	//density is 1 / mutual reachability distance
+	std::vector<double> densities(num_entity_ids, 0.0);
+	for(auto entity_index : entities_to_compute)
+	{
+		if(edge_distances[entity_index] > 0.0)
+			densities[entity_index] = 1.0 / edge_distances[entity_index];
+	}
+	//root has a 0 edge distance, so compute its density separately
+	size_t root_index = order.front();
+	densities[root_index] = 1.0 / core_distances[root_index];
+
+	//bottom-up pass to construct the total entity weights of the potential clusters
+	std::vector<double> subtree_cumulative_weights(num_entity_ids, 0.0);
+
+	//accumulate the total distances up the MST
+	for(auto it = order.rbegin(); it != order.rend(); ++it)
+	{
+		size_t entity_index = *it;
+		size_t parent_index = parent_entities[entity_index];
+
+		double w = 1.0;
+		distanceTransform->getEntityWeightFunction(entity_index, w);
+
+		subtree_cumulative_weights[entity_index] += w;
+
+		//if root, doesn't have a different parent, so don't accumulate
+		if(parent_index != entity_index)
+			subtree_cumulative_weights[parent_index] += subtree_cumulative_weights[entity_index];
+	}
+
+	stabilities.clear();
+	stabilities.resize(num_entity_ids, 0.0);
+
+	//accumulate stabilities using differences in densities
+	for(auto it = order.rbegin(); it != order.rend(); ++it)
+	{
+		size_t entity_index = *it;
+
+		//TODO 24886: use a different algorithm to make faster
+		//check all entities to find children of this node
+		for(size_t i = 0; i < num_entity_ids; i++)
+		{
+			if(parent_entities[i] == entity_index)
+			{
+				double delta_density = densities[i] - densities[entity_index];
+				if(delta_density < 0.0)
+					delta_density = 0.0;
+
+				stabilities[entity_index] += delta_density * subtree_cumulative_weights[i];
+			}
+		}
+	}
+
+	cluster_ids.clear();
+	cluster_ids.resize(num_entity_ids, 0);
+
+	//cluster id 0 is considered noise / not a cluster
+	size_t next_cluster_id = 1;
+
+	//minimum stability to avoid treating floating point noise as a cluster
+	constexpr double stability_eps = 1e-12;
+
+	//stack to search all descendants
+	std::vector<size_t> descendent_search_stack;
+
+	//walk the tree from leaves to root (reverse order)
+	for(auto it = order.rbegin(); it != order.rend(); ++it)
+	{
+		size_t entity_index = *it;
+
+		//skip if it has already been assigned
+		if(cluster_ids[entity_index] != 0)
+			continue;
+
+		//decide whether entity_index is eligible to become a cluster
+		if(stabilities[entity_index] < stability_eps)
+			continue;
+
+		//skip if not enough weight
+		if(subtree_cumulative_weights[entity_index] < minimum_cluster_weight)
+			continue;
+
+		//ensure no ancestor is already a cluster
+		bool ancestor_clustered = false;
+		size_t ancestor_index = parent_entities[entity_index];
+		//walk up until hit the root
+		while(ancestor_index != entity_index)
+		{
+			if(cluster_ids[ancestor_index] != 0)
+			{
+				ancestor_clustered = true;
+				break;
+			}
+
+			//stop if hit root
+			if(ancestor_index == parent_entities[ancestor_index])
+				break;
+			ancestor_index = parent_entities[ancestor_index];
+		}
+		if(ancestor_clustered)
+			continue;
+
+		//mark this entity as a new cluster with an id
+		cluster_ids[entity_index] = next_cluster_id;
+
+		//depth-first walk to label all descendants that are still unassigned
+		descendent_search_stack.clear();
+		descendent_search_stack.emplace_back(entity_index);
+		while(!descendent_search_stack.empty())
+		{
+			size_t cur_id = descendent_search_stack.back();
+			descendent_search_stack.pop_back();
+
+			cluster_ids[cur_id] = next_cluster_id;
+
+			//push child entities that are not yet labeled
+			for(size_t i = 0; i < num_entity_ids; i++)
+			{
+				if(parent_entities[i] == cur_id && cluster_ids[i] == 0 && i != cur_id)
+					descendent_search_stack.push_back(i);
+			}
+		}
+
+		//given that the cluster was accepted, need to remove its weight from its parents
+		double consumed = subtree_cumulative_weights[entity_index];
+		size_t up = parent_entities[entity_index];
+		while(up != entity_index)
+		{
+			subtree_cumulative_weights[up] -= consumed;
+			if(up == parent_entities[up])
+				break;
+			up = parent_entities[up];
+		}
+
+		next_cluster_id++;
+	}
+}
+
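Both the stability accumulation and the descendant labeling in ExtractClustersFromMST scan every entity to find the children of a node, which the in-code TODO notes is slow. A minimal sketch of the usual fix, assuming the parent_entities encoding used above (root points to itself, unplaced entities hold the max size_t sentinel; the helper name is hypothetical), is to build child adjacency lists once in linear time:

#include <cstddef>
#include <limits>
#include <vector>

//build an adjacency list of children from the MST parent array so that
//per-node child lookups cost O(number of children) instead of O(number of entities)
std::vector<std::vector<size_t>> BuildChildLists(const std::vector<size_t> &parent_entities)
{
	std::vector<std::vector<size_t>> children(parent_entities.size());
	for(size_t i = 0; i < parent_entities.size(); i++)
	{
		size_t parent = parent_entities[i];
		//skip the root (its own parent) and entities never placed in the tree
		if(parent != i && parent != std::numeric_limits<size_t>::max())
			children[parent].push_back(i);
	}
	return children;
}

With this precomputation, both passes over the tree become linear in the number of entities plus edges.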
+void EntityQueriesDensityProcessor::ComputeCaseClusters(EntityReferenceSet &entities_to_compute,
+	std::vector<double> &clusters_out, double minimum_cluster_weight)
+{
+	//prime the cache
+#ifdef MULTITHREAD_SUPPORT
+	knnCache->PreCacheKnn(&entities_to_compute, numNearestNeighbors, true, runConcurrently);
+#else
+	knnCache->PreCacheKnn(&entities_to_compute, numNearestNeighbors, true);
+#endif
+
+	//find distance contributions to use as core weights
+	size_t num_entity_indices = knnCache->GetEndEntityIndex();
+	size_t num_entities = entities_to_compute.size();
+	auto &core_distances = buffers.baseDistanceContributions;
+	core_distances.clear();
+	core_distances.resize(num_entity_indices, std::numeric_limits<double>::infinity());
+
+	IterateOverConcurrentlyIfPossible(entities_to_compute,
+		[this, &core_distances](auto /*unused index*/, auto entity)
+		{
+			auto &neighbors = knnCache->GetKnnCache(entity);
+
+			double entity_weight = 1.0;
+			distanceTransform->getEntityWeightFunction(entity, entity_weight);
+			core_distances[entity] = distanceTransform->ComputeDistanceContribution(neighbors, entity_weight);
+
+			distanceTransform->TransformDistances(neighbors, false);
+		}
+#ifdef MULTITHREAD_SUPPORT
+		, runConcurrently
+#endif
+	);
+
+	//entity indices, sorted descending by core distance
+	std::vector<size_t> order;
+	order.reserve(num_entities);
+	for(auto entity : entities_to_compute)
+		order.push_back(entity);
+	std::stable_sort(order.begin(), order.end(),
+		[&](size_t i, size_t j) { return core_distances[i] > core_distances[j]; });
+
+	//reuse baseDistanceProbabilities, but because clustering is not typically done repeatedly,
+	//don't reuse any of the other buffers
+	auto &edge_distances = buffers.baseDistanceProbabilities;
+	std::vector<size_t> parent_entities;
+	BuildMutualReachabilityMST(core_distances, order, edge_distances, parent_entities);
+
+	std::vector<size_t> cluster_ids_tmp;
+	std::vector<double> node_stabilities;
+	ExtractClustersFromMST(entities_to_compute, core_distances, edge_distances, parent_entities, order,
+		minimum_cluster_weight, cluster_ids_tmp, node_stabilities);
+
+	//convert integer ids to double
+	clusters_out.clear();
+	clusters_out.reserve(num_entities);
+	for(auto entity_id : entities_to_compute)
+		clusters_out.emplace_back(static_cast<double>(cluster_ids_tmp[entity_id]));
+}
+#else
+
+template<typename T>
+static inline void RemoveDuplicates(std::vector<T> &v)
+{
+	std::sort(v.begin(), v.end());
+	auto last = std::unique(v.begin(), v.end());
+	v.erase(last, v.end());
+}
+
+static inline void PruneAndCompactClusterIds(EntityQueriesDensityProcessor::EntityReferenceSet &entities_to_compute,
+	std::vector<size_t> &cluster_ids,
+	EntityQueriesStatistics::DistanceTransform &distance_transform,
+	double min_weight, std::vector<double> &clusters_out)
+{
+	FastHashMap<size_t, size_t> cluster_id_to_compact_cluster_id;
+	FastHashMap<size_t, double> total_weights_map;
+	for(auto entity_index : entities_to_compute)
+	{
+		size_t cluster_id = cluster_ids[entity_index];
+		if(cluster_id == 0)
+			continue;
+
+		double entity_weight = 1.0;
+		distance_transform.getEntityWeightFunction(entity_index, entity_weight);
+
+		cluster_id_to_compact_cluster_id.emplace(cluster_id, 0);
+		total_weights_map[cluster_id] += entity_weight;
+	}
+
+	//determine cluster ids to erase
+	FastHashSet<size_t> cluster_ids_to_erase;
+	for(auto &[id, w] : total_weights_map)
+	{
+		if(w < min_weight)
+		{
+			cluster_ids_to_erase.insert(id);
+			cluster_id_to_compact_cluster_id.erase(id);
+		}
+	}
+
+	//remove clusters that are too small
+	for(auto &id : cluster_ids)
+	{
+		if(cluster_ids_to_erase.find(id) != end(cluster_ids_to_erase))
+			id = 0;
+	}
+
+	//build compact mapping for surviving ids
+	size_t next_id = 1;
+	for(auto &cluster_mapping : cluster_id_to_compact_cluster_id)
+		cluster_mapping.second = next_id++;
+
+	//map ids and convert to double
+	clusters_out.clear();
+	clusters_out.reserve(entities_to_compute.size());
+	for(auto entity_index : entities_to_compute)
+	{
+		double id = 0.0;
+		if(cluster_ids[entity_index] != 0)
+		{
+			size_t cluster_id = cluster_ids[entity_index];
+			id = static_cast<double>(cluster_id_to_compact_cluster_id[cluster_id]);
+		}
+		clusters_out.emplace_back(id);
+	}
+}
+
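The merge step in the non-HDBSCAN ComputeCaseClusters below relabels clusters by scanning all previously processed entities for every merge. A disjoint-set forest is the textbook alternative; this is a sketch under the same smallest-id-wins convention, not the PR's implementation (the struct name is hypothetical):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

//disjoint-set forest with path halving; Merge keeps the smaller id as the root,
//matching the "use smallest cluster id" convention in the merge step below
struct ClusterIdForest
{
	std::vector<size_t> parent;

	explicit ClusterIdForest(size_t num_ids) : parent(num_ids)
	{
		std::iota(parent.begin(), parent.end(), 0);
	}

	size_t Find(size_t x)
	{
		while(parent[x] != x)
		{
			//path halving keeps the trees shallow
			parent[x] = parent[parent[x]];
			x = parent[x];
		}
		return x;
	}

	void Merge(size_t a, size_t b)
	{
		a = Find(a);
		b = Find(b);
		if(a != b)
			parent[std::max(a, b)] = std::min(a, b);
	}
};

Each entity would keep its provisional cluster id and resolve it through Find once at the end, turning the per-merge relabel pass into near-constant amortized work.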
+void EntityQueriesDensityProcessor::ComputeCaseClusters(EntityReferenceSet &entities_to_compute,
+	std::vector<double> &clusters_out, double minimum_cluster_weight)
+{
+	//prime the cache, grabbing many extra cases in case the distance contributions
+	//occasionally push past what is needed
+#ifdef MULTITHREAD_SUPPORT
+	knnCache->PreCacheKnn(&entities_to_compute, 3 * numNearestNeighbors, true, runConcurrently);
+#else
+	knnCache->PreCacheKnn(&entities_to_compute, 3 * numNearestNeighbors, true);
+#endif
+
+	//find distance contributions to use as core weights
+	size_t num_entity_indices = knnCache->GetEndEntityIndex();
+	size_t num_entities = entities_to_compute.size();
+	auto &distance_contributions = buffers.baseDistanceContributions;
+	distance_contributions.clear();
+	distance_contributions.resize(num_entity_indices, std::numeric_limits<double>::infinity());
+	std::vector<double> entity_bandwidth_case_weight(num_entity_indices, 0.0);
+	std::vector<size_t> entity_bandwidth_case_count(num_entity_indices, 0);
+
+	IterateOverConcurrentlyIfPossible(entities_to_compute,
+		[this, &distance_contributions, &entity_bandwidth_case_weight, &entity_bandwidth_case_count](auto /*unused index*/, auto entity)
+		{
+			auto &neighbors = knnCache->GetKnnCache(entity);
+
+			double entity_weight = 1.0;
+			distanceTransform->getEntityWeightFunction(entity, entity_weight);
+			distance_contributions[entity] = distanceTransform->ComputeDistanceContribution(neighbors, entity_weight);
+
+			size_t num_kept = distanceTransform->TransformDistances(neighbors, false, false);
+			for(size_t i = 0; i < num_kept; i++)
+			{
+				double neighbor_weight = 1.0;
+				distanceTransform->getEntityWeightFunction(neighbors[i].reference, neighbor_weight);
+				entity_bandwidth_case_weight[entity] += neighbor_weight;
+			}
+			entity_bandwidth_case_count[entity] = num_kept;
+
+			if(num_kept > 0)
+				distance_contributions[entity] = neighbors[num_kept - 1].distance;
+			else
+				distance_contributions[entity] = 0.0;
+		}
+#ifdef MULTITHREAD_SUPPORT
+		, runConcurrently
+#endif
+	);
+
+	//entity indices, sorted ascending by distance contribution
+	std::vector<size_t> order;
+	order.reserve(num_entities);
+	for(auto entity : entities_to_compute)
+		order.push_back(entity);
+	std::stable_sort(order.begin(), order.end(),
+		[&](size_t i, size_t j) { return distance_contributions[i] < distance_contributions[j]; });
+
+	std::vector<size_t> cluster_ids_tmp(num_entity_indices, 0);
+	size_t next_cluster_id = 1;
+	std::vector<size_t> cluster_ids_to_merge;
+	std::vector<size_t> entities_to_add_to_cluster;
+
+	for(auto cur_entity_index : order)
+	{
+		double dist_contrib_threshold = 0;
+
+		//make a copy of the neighbors and only consider actual neighbors, because TransformDistances will
+		//reduce the number of neighbors in the vector, but we don't want to reduce the number of neighbors in the cache
+		//this is because we want to check if the neighbors' neighbors can mutually reach each other within
+		//2x distance contribution (one for each neighbor), but that requires having the cache maintain
+		//a longer list of neighbors
+		auto &extended_neighbors = knnCache->GetKnnCache(cur_entity_index);
+
+		//truncate nearest neighbors -- if don't need this, then don't need to make a copy above
+		//auto neighbors(extended_neighbors);
+		//distanceTransform->TransformDistances(neighbors, false);
+		auto &neighbors = extended_neighbors;
+
+		size_t num_neighbors = entity_bandwidth_case_count[cur_entity_index];
+
+		//max
+		//for(size_t i = 0; i < num_neighbors; i++)
+		//	dist_contrib_threshold = std::max(dist_contrib_threshold, distance_contributions[neighbors[i].reference]);
+
+		//geomean
+		//dist_contrib_threshold = 1.0;
+		//for(auto &neighbor : neighbors)
+		//	dist_contrib_threshold *= distance_contributions[neighbor.reference];
+		//dist_contrib_threshold = std::pow(dist_contrib_threshold, 1.0 / neighbors.size());
+
+		//average
+		for(size_t i = 0; i < num_neighbors; i++)
+			dist_contrib_threshold += distance_contributions[neighbors[i].reference];
+		dist_contrib_threshold /= num_neighbors;
+
+		//average to dist contrib
+		//size_t counted = 0;
+		//for(auto &neighbor : neighbors)
+		//{
+		//	if(neighbor.distance > distance_contributions[cur_entity_index])
+		//		break;
+		//	counted++;
+		//	dist_contrib_threshold += distance_contributions[neighbor.reference];
+		//}
+		//dist_contrib_threshold /= counted;
+
+		//rmse
+		//for(auto &neighbor : neighbors)
+		//	dist_contrib_threshold += distance_contributions[neighbor.reference] * distance_contributions[neighbor.reference];
+		//dist_contrib_threshold /= neighbors.size();
+		//dist_contrib_threshold = std::sqrt(dist_contrib_threshold);
+
+		//neighbor bandwidth weight
+		double neighbor_bandwidth_weight_threshold = 0.0;
+		num_neighbors = entity_bandwidth_case_count[cur_entity_index];
+
+		//average
+		for(size_t i = 0; i < num_neighbors; i++)
+			neighbor_bandwidth_weight_threshold += entity_bandwidth_case_weight[neighbors[i].reference];
+		neighbor_bandwidth_weight_threshold /= num_neighbors;
+
+		//max
+		//for(size_t i = 0; i < num_neighbors; i++)
+		//	neighbor_bandwidth_weight_threshold = std::max(neighbor_bandwidth_weight_threshold, entity_bandwidth_case_weight[neighbors[i].reference]);
+
+		/////////////
+
+		//ensure a close enough mutual neighbor, using the average distance contribution to assess reachability
+		//use 2x max neighbor dist contribution, as each point would have its own distance contribution
+		dist_contrib_threshold *= 2;
+
+		//auto dist_eval = knnCache->GetDistanceEvaluator();
+		//dist_contrib_threshold *= std::sqrt(dist_eval->featureAttribs.size());
+
+		//accumulate all clusters that are potentially overlapping or touching this one;
+		//that requires going through all neighbors and checking whether each one can reach back
+		bool found_mutual_neighbor = false;
+		cluster_ids_to_merge.clear();
+		entities_to_add_to_cluster.clear();
+		double cumulative_neighbor_weight = 0.0;
+		for(auto &neighbor : extended_neighbors)
+		{
+			double neighbor_weight = 1.0;
+			distanceTransform->getEntityWeightFunction(neighbor.reference, neighbor_weight);
+			cumulative_neighbor_weight += neighbor_weight;
+
+			double cumulative_neighbor_neighbor_weight = 0.0;
+			auto &neighbors_neighbors = knnCache->GetKnnCache(neighbor.reference);
+			for(auto &neighbor_neighbor : neighbors_neighbors)
+			{
+				double neighbor_neighbor_weight = 1.0;
+				distanceTransform->getEntityWeightFunction(neighbor_neighbor.reference, neighbor_neighbor_weight);
+				cumulative_neighbor_neighbor_weight += neighbor_neighbor_weight;
+
+				//mutual neighbor
+				if(neighbor_neighbor.reference == cur_entity_index)
+				{
+					//candidate distance is the max of the two distances and the max of the two distance contributions,
+					//which makes sure sparse points do not connect; this is similar to core distance in the HDBSCAN algorithm
+					double max_distance = std::max(neighbor.distance, neighbor_neighbor.distance);
+					double dist_contrib_cur = distance_contributions[cur_entity_index];
+					double dist_contrib_neighbor = distance_contributions[neighbor.reference];
+					double max_distance_contrib = std::max(dist_contrib_neighbor, dist_contrib_cur);
+
+					double candidate_distance = std::max(max_distance, max_distance_contrib);
+
+					if(candidate_distance > dist_contrib_threshold)
+						continue;
+
+					found_mutual_neighbor = true;
+
+					if(cluster_ids_tmp[neighbor.reference] != 0)
+						cluster_ids_to_merge.emplace_back(cluster_ids_tmp[neighbor.reference]);
+					else
+						entities_to_add_to_cluster.emplace_back(neighbor.reference);
+
+					break;
+				}
+
+				if(cumulative_neighbor_neighbor_weight >= neighbor_bandwidth_weight_threshold)
+					break;
+			}
+
+			if(cumulative_neighbor_weight >= neighbor_bandwidth_weight_threshold)
+				break;
+		}
+
+		//if nothing mutually reachable, leave out of clustering
+		if(!found_mutual_neighbor)
+			continue;
+
+		RemoveDuplicates(cluster_ids_to_merge);
+
+		if(cluster_ids_to_merge.size() > 0)
+		{
+			//use smallest cluster id and remove it from the list
+			size_t cur_cluster_id = cluster_ids_to_merge.front();
+			cluster_ids_to_merge.erase(begin(cluster_ids_to_merge));
+			cluster_ids_tmp[cur_entity_index] = cur_cluster_id;
+
+			for(auto updating_index : order)
+			{
+				auto found = std::find(begin(cluster_ids_to_merge), end(cluster_ids_to_merge), cluster_ids_tmp[updating_index]);
+				if(found != end(cluster_ids_to_merge))
+					cluster_ids_tmp[updating_index] = cur_cluster_id;
+			}
+		}
+		else //new cluster
+		{
+			size_t cur_cluster_id = next_cluster_id++;
+			cluster_ids_tmp[cur_entity_index] = cur_cluster_id;
+		}
+
+		for(auto &neighbor_entity_index : entities_to_add_to_cluster)
+			cluster_ids_tmp[neighbor_entity_index] = cluster_ids_tmp[cur_entity_index];
+	}
+
+	PruneAndCompactClusterIds(entities_to_compute, cluster_ids_tmp, *distanceTransform, minimum_cluster_weight, clusters_out);
+}
+
+#endif
+
+//TODO 24886: make algorithms more efficient
+//TODO 24886: add documentation
+//TODO 24886: add tests to full_test.amlg
+//TODO 24886: remove HDBSCAN code
+
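For the HDBSCAN-mode extraction above, ExtractClustersFromMST scores each node by accumulating density gains over its MST children. Writing $\lambda_i = 1/e_i$ for the density implied by entity $i$'s edge distance $e_i$ (the root uses its core distance instead), and $W_c$ for the accumulated entity weight of child $c$'s subtree, the stability that the code computes is:

$$ \mathrm{stability}(v) \;=\; \sum_{c \,:\, \mathrm{parent}(c) = v} \max\bigl(0,\ \lambda_c - \lambda_v\bigr)\, W_c $$

Reading this as a weighted variant of the classical HDBSCAN stability $\sum_{p \in C} \bigl(\lambda_p - \lambda_{\mathrm{birth}}(C)\bigr)$ is an interpretation of the code, not a claim made by the PR itself.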
diff --git a/src/Amalgam/Conviction.h b/src/Amalgam/entity/EntityQueriesDensityFunctions.h
similarity index 91%
rename from src/Amalgam/Conviction.h
rename to src/Amalgam/entity/EntityQueriesDensityFunctions.h
index 574255b38..e39ec2932 100644
--- a/src/Amalgam/Conviction.h
+++ b/src/Amalgam/entity/EntityQueriesDensityFunctions.h
@@ -52,12 +52,13 @@ inline double PartialKullbackLeiblerDivergenceFromIndices(const std::vector<double> ...
 #ifdef MULTITHREAD_SUPPORT
-	ConvictionProcessor(KnnCache &cache,
+	EntityQueriesDensityProcessor(KnnCache &cache,
 		EntityQueriesStatistics::DistanceTransform &distance_transform, size_t num_nearest_neighbors,
 		StringInternPool::StringID radius_label, bool run_concurrently)
 #else
-	ConvictionProcessor(KnnCache &cache,
+	EntityQueriesDensityProcessor(KnnCache &cache,
 		EntityQueriesStatistics::DistanceTransform &distance_transform, size_t num_nearest_neighbors,
 		StringInternPool::StringID radius_label)
 #endif
@@ -420,7 +421,7 @@ class ConvictionProcessor
 	// if false, then will compute the conviction as if those entities were added or included
 	inline void ComputeCaseKLDivergences(EntityReferenceSet &entities_to_compute, std::vector<double> &convictions_out, bool normalize_convictions, bool conviction_of_removal)
 	{
-		//prime the cache
+		//prime the cache, including one extra case so each can be left out
 #ifdef MULTITHREAD_SUPPORT
 		knnCache->PreCacheKnn(nullptr, numNearestNeighbors + 1, true, runConcurrently);
 #else
@@ -637,6 +638,41 @@
 		return KullbackLeiblerDivergence(scaled_base_distance_contribs, combined_model_distance_contribs);
 	}
 
+	//builds the minimum spanning tree (MST) on the mutual reachability graph.
+	//The algorithm is a Prim-like sweep over the entities sorted by core distance.
+	//Each point only considers relevant neighbors, and the smallest mutual reachability
+	// distance among those neighbors becomes the edge that connects the
+	// point to the growing tree.
+	//core_distances is the vector of core distances for each entity, infinity for entities not considered
+	//order is the list of entities sorted by descending core distance
+	//upon completion, edge_distances will contain the distance of the edge that links each entity to
+	//its parent in the MST (root gets distance = 0)
+	//parent_entities will contain the parent index for each entity (root points to itself)
+	void BuildMutualReachabilityMST(std::vector<double> &core_distances, std::vector<size_t> &order,
+		std::vector<double> &edge_distances, std::vector<size_t> &parent_entities);
+
+	//extracts clusters from the minimum spanning tree (MST)
+	//entities_to_compute is the set of entities to consider
+	//edge_distances is a vector of the mutual reachability distance of the edge that
+	// connects each entity to its parent in the MST, 0 if the entity is not included
+	//parent_entities contains the parent index for each entity (root points to itself)
+	//order is the set of entities sorted by descending core distance
+	//minimum_cluster_weight is the minimum total weight required for a cluster
+	//the method fills the cluster_ids output vector, where cluster id 0 is noise / no cluster
+	//stabilities contains the stability score for each entity
+	void ExtractClustersFromMST(EntityReferenceSet &entities_to_compute,
+		std::vector<double> &core_distances, std::vector<double> &edge_distances,
+		std::vector<size_t> &parent_entities, std::vector<size_t> &order, double minimum_cluster_weight,
+		std::vector<size_t> &cluster_ids, std::vector<double> &stabilities);
+
+	//TODO 24886: add documentation
+	//TODO 24886: add tests to full_test.amlg
+
+	//computes clusters for each entity and sets the corresponding index of clusters_out to the cluster id
+	//minimum_cluster_weight is the least amount of weight that can be used to constitute a cluster
+	void ComputeCaseClusters(EntityReferenceSet &entities_to_compute,
+		std::vector<double> &clusters_out, double minimum_cluster_weight);
+
 protected:
 
 	KnnCache *knnCache;
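To connect the renamed header to its call site, here is a minimal sketch of how the processor is driven, mirroring the EntityQueryCaches::GetMatchingEntities changes further below. The function name is hypothetical, the constructor arguments follow the non-multithreaded branch shown above, and the surrounding setup (a populated cache and transform) is assumed to exist as in that file:

#include <vector>
#include "EntityQueriesDensityFunctions.h"

//sketch: given an already-populated knn cache and distance transform
//(as set up in EntityQueryCaches::GetMatchingEntities), cluster the entities
void ClusterEntitiesSketch(KnnCache &knn_cache,
	EntityQueriesStatistics::DistanceTransform &distance_transform,
	EntityQueriesDensityProcessor::EntityReferenceSet &entities_to_compute,
	double min_cluster_weight, std::vector<double> &cluster_ids_out)
{
	EntityQueriesDensityProcessor processor(knn_cache, distance_transform,
		distance_transform.GetNumToRetrieve(), StringInternPool::NOT_A_STRING_ID);

	//cluster_ids_out[i] is the cluster id of the i-th entity in entities_to_compute;
	//id 0 marks entities left out of every cluster
	processor.ComputeCaseClusters(entities_to_compute, cluster_ids_out, min_cluster_weight);
}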
diff --git a/src/Amalgam/entity/EntityQueriesStatistics.h b/src/Amalgam/entity/EntityQueriesStatistics.h
index 0a0bd06a1..49578acd7 100644
--- a/src/Amalgam/entity/EntityQueriesStatistics.h
+++ b/src/Amalgam/entity/EntityQueriesStatistics.h
@@ -12,8 +12,8 @@
 //these macros define the specific algorithm used for aggregation of distance contributions
 //the geometric mean has been found to be the best combination of performance and mathematical defensibility
 // #define DIST_CONTRIBS_HARMONIC_MEAN
-#define DIST_CONTRIBS_GEOMETRIC_MEAN
-// #define DIST_CONTRIBS_ARITHMETIC_MEAN
+//#define DIST_CONTRIBS_GEOMETRIC_MEAN
+#define DIST_CONTRIBS_ARITHMETIC_MEAN
 //#define DIST_CONTRIBS_PROBABILITY_MEAN //this last one is the default if none of the above are defined
 //#define DIST_CONTRIBS_ENTROPY
@@ -542,9 +542,10 @@ class EntityQueriesStatistics
 	// and updating the length of entity_distance_pair_container
 	//selects the bandwidth from the transformed values and returns the number of entities to keep,
 	// which may be less than the total
+	//returns the number of cases kept after bandwidth selection
 	template<typename EntityDistancePairContainer>
-	inline void TransformDistances(EntityDistancePairContainer &entity_distance_pair_container,
-		bool sort_results)
+	inline size_t TransformDistances(EntityDistancePairContainer &entity_distance_pair_container,
+		bool sort_results, bool resize_neighbors = true)
 	{
 		size_t num_kept = TransformDistancesWithBandwidthSelectionAndResultFunction(
 			entity_distance_pair_container.begin(), entity_distance_pair_container.end(),
 			[](auto ed_pair, double weighted_value) {
 				ed_pair->distance = weighted_value;
 			});
 
-		entity_distance_pair_container.resize(num_kept);
+		if(resize_neighbors)
+			entity_distance_pair_container.resize(num_kept);
 
 		if(sort_results)
 		{
@@ -575,6 +577,8 @@
 			);
 		}
+
+		return num_kept;
 	}
 
 	//like TransformDistances but returns the appropriate expected value
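The new resize_neighbors flag exists so a caller can honor bandwidth selection without shrinking a shared buffer. This fragment sketches the pattern used in EntityQueriesDensityFunctions.cpp above; the surrounding objects are assumed, and AccumulateNeighbor is a hypothetical consumer:

//sketch: keep the cached neighbor list intact while only using the kept prefix
auto &neighbors = knnCache->GetKnnCache(entity);
size_t num_kept = distanceTransform->TransformDistances(neighbors, false, false);

//only the first num_kept entries passed bandwidth selection; the rest remain
//available for callers that need the extended neighbor list
for(size_t i = 0; i < num_kept; i++)
	AccumulateNeighbor(neighbors[i]);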
diff --git a/src/Amalgam/entity/EntityQueryCaches.cpp b/src/Amalgam/entity/EntityQueryCaches.cpp
index a1ab615ec..2d7a6f7cd 100644
--- a/src/Amalgam/entity/EntityQueryCaches.cpp
+++ b/src/Amalgam/entity/EntityQueryCaches.cpp
@@ -1,8 +1,8 @@
 //project headers:
-#include "Conviction.h"
 #include "Entity.h"
 #include "EntityManipulation.h"
 #include "EntityQueries.h"
+#include "EntityQueriesDensityFunctions.h"
 #include "EntityQueryCaches.h"
 #include "EvaluableNodeTreeFunctions.h"
 #include "HashMaps.h"
@@ -20,7 +20,7 @@ EntityQueryCaches::QueryCachesBuffers EntityQueryCaches::buffers;
 #if defined(MULTITHREAD_SUPPORT) || defined(MULTITHREAD_INTERFACE)
 thread_local
 #endif
-ConvictionProcessor::ConvictionProcessorBuffers ConvictionProcessor::buffers;
+EntityQueriesDensityProcessor::ConvictionProcessorBuffers EntityQueriesDensityProcessor::buffers;
 
 bool EntityQueryCaches::DoesCachedConditionMatch(EntityQueryCondition *cond, bool last_condition)
 {
@@ -29,7 +29,8 @@ bool EntityQueryCaches::DoesCachedConditionMatch(EntityQueryCondition *cond, boo
 	if(qt == ENT_QUERY_NEAREST_GENERALIZED_DISTANCE || qt == ENT_QUERY_WITHIN_GENERALIZED_DISTANCE
 		|| qt == ENT_QUERY_DISTANCE_CONTRIBUTIONS || qt == ENT_QUERY_ENTITY_CONVICTIONS
 		|| qt == ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE || qt == ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS
-		|| qt == ENT_QUERY_ENTITY_KL_DIVERGENCES || qt == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS)
+		|| qt == ENT_QUERY_ENTITY_KL_DIVERGENCES || qt == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+		|| qt == ENT_QUERY_ENTITY_CLUSTERS)
 	{
 		//accelerating a p of 0 with the current caches would be a large effort, as everything would have to be
 		// transformed via logarithms and then pValue = 1 applied
@@ -75,6 +76,7 @@ void EntityQueryCaches::EnsureLabelsAreCached(EntityQueryCondition *cond)
 		case ENT_QUERY_ENTITY_KL_DIVERGENCES:
 		case ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE:
 		case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
+		case ENT_QUERY_ENTITY_CLUSTERS:
 		{
 			for(auto label : cond->positionLabels)
 			{
@@ -254,6 +256,7 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray
 		case ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE:
 		case ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS:
 		case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
+		case ENT_QUERY_ENTITY_CLUSTERS:
 		{
 			//get entity (case) weighting if applicable
 			bool use_entity_weights = (cond->weightLabel != StringInternPool::NOT_A_STRING_ID);
@@ -385,13 +388,16 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray
 			{
 				BitArrayIntegerSet *ents_to_compute_ptr = nullptr;	//if nullptr, compute is done on all entities in the cache
 
+				//applies to any query type that computes per-entity results
 				if(cond->queryType == ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS
 					|| cond->queryType == ENT_QUERY_ENTITY_CONVICTIONS
 					|| cond->queryType == ENT_QUERY_ENTITY_KL_DIVERGENCES
 					|| cond->queryType == ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE
-					|| cond->queryType == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS)
+					|| cond->queryType == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+					|| cond->queryType == ENT_QUERY_ENTITY_CLUSTERS)
 				{
-					if(cond->existLabels.size() != 0)	//if subset is specified, set ents_to_compute_ptr to set of ents_to_compute
+					//if subset is specified, set ents_to_compute_ptr to set of ents_to_compute
+					if(cond->existLabels.size() != 0)
 					{
 						ents_to_compute_ptr = &buffers.tempMatchingEntityIndices;
 						ents_to_compute_ptr->clear();
@@ -424,10 +430,10 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray
 				}
 
#ifdef MULTITHREAD_SUPPORT
-				ConvictionProcessor conviction_processor(buffers.knnCache,
+				EntityQueriesDensityProcessor conviction_processor(buffers.knnCache,
 					distance_transform, distance_transform.GetNumToRetrieve(), cond->singleLabel, cond->useConcurrency);
#else
-				ConvictionProcessor conviction_processor(buffers.knnCache,
+				EntityQueriesDensityProcessor conviction_processor(buffers.knnCache,
 					distance_transform, distance_transform.GetNumToRetrieve(), cond->singleLabel);
#endif
 				buffers.knnCache.ResetCache(sbfds, matching_entities, cond->distEvaluator, cond->positionLabels, cond->singleLabel);
@@ -461,7 +467,7 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray
 				{
 					conviction_processor.ComputeDistanceContributionsWithoutCache(ents_to_compute_ptr, results_buffer);
 				}
-				else //ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+				else if(cond->queryType == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS)
 				{
 					conviction_processor.ComputeNeighborWeightsForEntities(ents_to_compute_ptr, compute_results);
@@ -475,6 +481,10 @@ void EntityQueryCaches::GetMatchingEntities(EntityQueryCondition *cond, BitArray
 					//early exit because don't need to translate distances
 					return;
 				}
+				else //ENT_QUERY_ENTITY_CLUSTERS
+				{
+					conviction_processor.ComputeCaseClusters(*ents_to_compute_ptr, results_buffer, cond->minClusterWeight);
+				}
 
 				//clear compute_results as it may have been used for intermediate results
 				compute_results.clear();
@@ -1030,6 +1040,7 @@ EvaluableNodeReference EntityQueryCaches::GetMatchingEntitiesFromQueryCaches(Ent
 			case ENT_QUERY_ENTITY_CONVICTIONS:
 			case ENT_QUERY_ENTITY_KL_DIVERGENCES:
 			case ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS:
+			case ENT_QUERY_ENTITY_CLUSTERS:
 			{
 				entity_caches->GetMatchingEntities(&cond, matching_ents, compute_results, is_first, !is_last || !return_query_value);
 				break;
@@ -1235,7 +1246,8 @@ EvaluableNodeReference EntityQueryCaches::GetMatchingEntitiesFromQueryCaches(Ent
 		|| last_query_type == ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS
 		|| last_query_type == ENT_QUERY_ENTITY_CONVICTIONS
 		|| last_query_type == ENT_QUERY_ENTITY_KL_DIVERGENCES
-		|| last_query_type == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS)
+		|| last_query_type == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+		|| last_query_type == ENT_QUERY_ENTITY_CLUSTERS)
 	{
 		if(immediate_result.AnyImmediateType())
 			return EvaluableNodeReference(static_cast<double>(compute_results.size()));
@@ -1334,7 +1346,8 @@ EvaluableNodeReference EntityQueryCaches::GetEntitiesMatchingQuery(EntityReadRef
 			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_KL_DIVERGENCES
 			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_GROUP_KL_DIVERGENCE
 			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS
-			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS)
+			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+			|| conditions[cond_index].queryType == ENT_QUERY_ENTITY_CLUSTERS)
 		{
 			if(!CanUseQueryCaches(conditions))
 				return EvaluableNodeReference::Null();
diff --git a/src/Amalgam/entity/EntityQueryCaches.h b/src/Amalgam/entity/EntityQueryCaches.h
index ecef6050a..463fda645 100644
--- a/src/Amalgam/entity/EntityQueryCaches.h
+++ b/src/Amalgam/entity/EntityQueryCaches.h
@@ -1,8 +1,8 @@
 #pragma once
 
 //project headers:
-#include "Conviction.h"
 #include "DistanceReferencePair.h"
+#include "EntityQueriesDensityFunctions.h"
 #include "EvaluableNode.h"
 #include "IntegerSet.h"
 #include "KnnCache.h"
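The dispatch above routes ENT_QUERY_ENTITY_CLUSTERS to ComputeCaseClusters, which in turn relies on ExtractClustersFromMST to pull clusters out of the mutual-reachability MST once the total case weight of a group reaches minClusterWeight. As a rough illustration only — a single-threshold simplification, not the stability-based extraction the patch declares — one can merge MST edges below a cut distance with union-find and keep only components heavy enough to qualify, labeling everything else cluster 0 (noise); all names below are hypothetical:

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <tuple>
#include <vector>

struct UnionFind
{
	std::vector<size_t> parent;
	std::vector<double> weight; //total case weight per component root

	UnionFind(const std::vector<double> &case_weights)
		: parent(case_weights.size()), weight(case_weights)
	{
		std::iota(parent.begin(), parent.end(), size_t{0});
	}

	size_t Find(size_t x)
	{
		while(parent[x] != x)
			x = parent[x] = parent[parent[x]]; //path halving
		return x;
	}

	void Union(size_t a, size_t b)
	{
		a = Find(a);
		b = Find(b);
		if(a == b)
			return;
		parent[b] = a;
		weight[a] += weight[b];
	}
};

//edges are (mutual-reachability distance, child, parent) from the MST;
// returns 0 for noise, 1..k for clusters whose accumulated weight reached min_cluster_weight
std::vector<size_t> ExtractClustersSketch(size_t num_entities,
	std::vector<std::tuple<double, size_t, size_t>> edges,
	const std::vector<double> &case_weights,
	double min_cluster_weight, double max_edge_distance)
{
	std::sort(edges.begin(), edges.end());
	UnionFind uf(case_weights);

	//merge only sufficiently tight edges; everything beyond the cut stays separate
	for(auto &[dist, child, parent] : edges)
	{
		if(dist > max_edge_distance)
			break;
		uf.Union(child, parent);
	}

	//assign ids 1..k to components heavy enough to be clusters; the rest stay 0
	std::vector<size_t> cluster_ids(num_entities, 0);
	std::vector<size_t> root_to_id(num_entities, 0);
	size_t next_id = 1;
	for(size_t i = 0; i < num_entities; i++)
	{
		size_t root = uf.Find(i);
		if(uf.weight[root] < min_cluster_weight)
			continue;
		if(root_to_id[root] == 0)
			root_to_id[root] = next_id++;
		cluster_ids[i] = root_to_id[root];
	}
	return cluster_ids;
}

With per-case weights of 1, min_cluster_weight behaves like a minimum cluster size; fractional or label-driven case weights shift that threshold proportionally, which appears to be why the patch phrases the parameter as a weight rather than a count.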
diff --git a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp
index 83ae8aaa7..8fb2788fe 100644
--- a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp
+++ b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp
@@ -2084,6 +2084,7 @@ CompactHashMap EvaluableNodeTreeManipulation::evaluab
 	{ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS, 0.2},
 	{ENT_QUERY_ENTITY_KL_DIVERGENCES, 0.2},
 	{ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS,0.2},
+	{ENT_QUERY_ENTITY_CLUSTERS, 0.2},
 
 	//entity access
 	{ENT_CONTAINS_LABEL, 0.5},
diff --git a/src/Amalgam/interpreter/Interpreter.cpp b/src/Amalgam/interpreter/Interpreter.cpp
index 1f8ca912b..e39cb25d8 100644
--- a/src/Amalgam/interpreter/Interpreter.cpp
+++ b/src/Amalgam/interpreter/Interpreter.cpp
@@ -271,6 +271,7 @@ std::array Interpreter
 	&Interpreter::InterpretNode_ENT_QUERY_opcodes, // ENT_QUERY_ENTITY_DISTANCE_CONTRIBUTIONS
 	&Interpreter::InterpretNode_ENT_QUERY_opcodes, // ENT_QUERY_ENTITY_KL_DIVERGENCES
 	&Interpreter::InterpretNode_ENT_QUERY_opcodes, // ENT_QUERY_ENTITY_CUMULATIVE_NEAREST_ENTITY_WEIGHTS
+	&Interpreter::InterpretNode_ENT_QUERY_opcodes, // ENT_QUERY_ENTITY_CLUSTERS
 
 	//entity access
 	&Interpreter::InterpretNode_ENT_CONTAINS_LABEL, // ENT_CONTAINS_LABEL
diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt
index e67a3c36a..40f2a5334 100644
--- a/src/Amalgam/out.txt
+++ b/src/Amalgam/out.txt
@@ -135,6 +135,7 @@ hello world: 12 and 2
 query_among 0.2
 query_between 0.2
 query_distance_contributions 0.2
+query_entity_clusters 0.2
 query_entity_convictions 0.2
 query_entity_cumulative_nearest_entity_weights 0.2
 query_entity_distance_contributions 0.2
@@ -641,7 +642,7 @@ abcdef
 3
 3.3166247903554
 2.111111111111111
-2.111111111111111
+2.1111111111111116
 1.5714285714285714
 1.5714285714285714
 --generalized_distance--
@@ -718,18 +719,18 @@ a
 [1 2 3 4 5 6]
 []
 {
-  a 1
  b 2
  c 3
  d 4
  e 5
+  f 6
 }
-{b 2 d 4}
+{c 3 f 6}
 {
-  b 2
  c 3
  d 4
  e 5
+  f 6
 }
 {
  a 1
@@ -769,18 +770,18 @@ c
 [1 2 3 4 5 6]
 []
 {
-  a 1
  b 2
  c 3
  d 4
  e 5
+  f 6
 }
-{b 2 d 4}
+{c 3 f 6}
 {
-  b 2
  c 3
  d 4
  e 5
+  f 6
 }
 {
  a 1
@@ -1144,8 +1145,8 @@ abcdef
  7
 ]
 [0 1 2 3]
-[1 2 0 3]
-[1 2 0 3]
+[1 3 2 0]
+[1 3 2 0]
 --values--
 [1 "d" 3 2]
 [
@@ -1168,7 +1169,7 @@ abcdef
  4
  "d"
 ]
-[1 "d" 3 2]
+[1 2 "d" 3]
 [
  1
  2
@@ -1298,8 +1299,8 @@ list assoc
 [
  [4]
-  "4"
  4
+  "4"
  {4 4}
 ]
 --set--
@@ -1394,7 +1395,7 @@ current_index: 2
   8
  ]
  accum_string "abcdef"
-  argv ["C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
+  argv ["C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
  bar (declare
   {x 6}
   (+ x 2)
@@ -1407,13 +1408,13 @@ current_index: 2
   A {B 2}
   B 2
  }
-  interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+  interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
  raaa 2
  rmdir "rmdir /s /q "
 	rmfile "del /s /q "
 	rwww 1
 	slash "\\"
-	start_time 1767108452.353487
+	start_time 1767291495.935935
 	www 1
 	x 12
 	zz 10
@@ -1440,7 +1441,7 @@ current_index: 2
 		8
 	]
 	accum_string "abcdef"
-	argv ["C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
+	argv ["C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
 	bar (declare
 		{x 6}
 		(+ x 2)
@@ -1453,13 +1454,13 @@ current_index: 2
 		A {B 2}
 		B 2
 	}
-	interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+	interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
 	raaa 2
 	rmdir "rmdir /s /q "
 	rmfile "del /s /q "
 	rwww 1
 	slash "\\"
-	start_time 1767108452.353487
+	start_time 1767291495.935935
 	www 1
 	x 12
 	zz 10
@@ -1485,7 +1486,7 @@ current_index: 2
 		8
 	]
 	accum_string "abcdef"
-	argv ["C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
+	argv ["C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg"]
 	bar (declare
 		{x 6}
 		(+ x 2)
@@ -1498,13 +1499,13 @@ current_index: 2
 		A {B 2}
 		B 2
 	}
-	interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+	interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
 	raaa 2
 	rmdir "rmdir /s /q "
 	rmfile "del /s /q "
 	rwww 1
 	slash "\\"
-	start_time 1767108452.353487
+	start_time 1767291495.935935
 	www 1
 	x 12
 	zz 10
@@ -1612,11 +1613,11 @@ b
 	"b"
 ]
-infinity test c or d: ["d" "d" "c" "c"]
+infinity test c or d: ["c" "c" "d" "d"]
 
 {a 16 b 49 c 35}
-[2 7 3]
+[3 6 2]
 
 --get_rand_seed--
 c``!cl
@@ -1721,17 +1722,17 @@ string
 	{a 3 b 4}
 	{c "c"}
 ]
-21: [{"a":3,"b":4},{"c":"c","d":null}]
+21: [{"a":3,"b":4},{"d":null,"c":"c"}]
 22: [{"a":3,"b":4},{"c":"c","d":null}]
 23:
 a: 1
+b: 2
+d: 4
 e:
 - a
 - b
 - - .inf
 c: 3
-b: 2
-d: 4
 24:
 a: 1
 b: 2
@@ -1744,7 +1745,7 @@ e:
 - a
 - b
 - - .inf
 25: {a 1}
-current date-time in epoch: 2025-12-30-10.27.32.3781140
+current date-time in epoch: 2026-01-01-13.18.15.9927590
 2020-06-07 00:22:59
 1391230800
 1391230800
@@ -2009,37 +2010,18 @@ decrypted: hello
 --total_size--
 10
 --mutate--
-(index_min
-	(indices)
-	(union)
-	3
-	(=)
-	5
-	6
-	7
-	8
-	9
-	(let)
-	11
-	3.6207830689833838
-	13
-	14
-	(associate)
-)
+(associate "a" 1 "b" 2)
 [
-	(+)
+	1
 	(+)
 	3
 	4
-	(associate "alpha" 5 "beta" 6)
+	(associate "alpha" 5 (*) 6)
 	(associate
 		"nest"
-		(associate
-			"count"
-			[7 (+) 9]
-		)
+		(associate "count" [])
 		"end"
-		[(*) 11 12]
+		[(+) (+) 12]
 	)
 ]
 --commonality--
@@ -2483,15 +2465,7 @@ decrypted: hello
 	]
 ]
 --mix--
-[
-	1
-	3
-	5
-	7
-	9
-	11
-	13
-]
+[3 6 7 9 12 14]
 [
 	;comment 1
@@ -2499,7 +2473,7 @@ decrypted: hello
 	;comment 3
 	;comment 4
 	1
-	3
+	4
 	5
 	7
 	9
@@ -2509,21 +2483,13 @@ decrypted: hello
 [
 	1
 	5
-	2.5
+	2
 	(associate "a" 3 "b" 4)
 	(lambda
 		(if true
 			1
-			(unordered_list
-				(get_entity_comments)
-				(lambda
-					(print
-						[2 9]
-					)
-				)
-				1
-			)
+			(unordered_list (get_entity_comments) 1)
 		)
 	)
 	[5]
@@ -2544,36 +2510,44 @@ decrypted: hello
 						[2 9]
 					)
 				)
+				1
 			)
 		)
 	)
-	[5 6]
+	[]
 ]
-[3.5 5.5 7.5 9.5 11.5 13.5]
 [
 	.true
-	2
-	4
+	3.5
+	5.5
+	7.5
+	9.5
+	11.5
+	13.5
+]
+[
 	3
 	6
+	5
+	7
 	10
 	9
-	12
 	11
+	14
+	13
 ]
-4
 1
 4
 2.5
+2.5
 abcdexyz
 abcdexyz
-abcdexyz
+abcomxyz
 {
 	a [0 1]
 	b [1 2]
-	c [3]
-	y []
-	z [5]
+	c []
+	x [3 4]
 }
 --total_entity_size--
 79
@@ -2820,58 +2794,58 @@ flatten restore with concurrency
 )
 --mutate_entity--
 [
-	1
-	2
-	{}
+	##a -0.12739193733510926
+	1.7249819167355707
+	(call)
 	4
 	5
-	6
-	7
+	16.727676640725225
+	3.658868677125917
 	8
-	(query_among)
-	(get_type)
+	9
+	10
 	11
 	12
-	(~)
-	14
-	(associate (intersect) 1 b (atanh))
+	13
+	(query_exists)
+	(associate)
 ]
 [
 	1
-	(call)
-	0.9015975926291679
+	-1
+	(seq)
 	4
-	5
-	(/)
+	(-)
+	6
 	7
-	(get)
-	12
-	(contained_entities)
-	(contains_value)
-	9
-	(dot_product)
+	8
+	(set)
+	-1.159501719612117
+	(unordered_list)
+	(retrieve_from_entity)
+	13
 	14
-	(associate (query_exists) (index_max) "b" 2)
+	.false
 ]
-(-
+[
 	(-)
 	2
 	3
-	(+)
-	(+)
+	4
+	5
 	6
-	7
+	(-)
 	8
-	9
 	10
-	11
 	(*)
+	(+)
 	13
-	14
-	(associate)
-)
+	(+)
+	(associate "a" (-) (+) (-))
+]
 --commonality_entities--
 73.3678794503212
@@ -2883,10 +2857,10 @@ MergeEntityChild1
 (associate "x" 3 "y" 4)
 MergeEntityChild2
 (associate "p" 3 "q" 4)
-_3130331116
-(associate "E" 3 "F" 4)
 _1651806471
 (associate "e" 3 "f" 4)
+_3130331116
+(associate "E" 3 "F" 4)
 --union_entities--
 (associate "b" 4 "a" 3 "c" 3)
 MergeEntityChild1
@@ -2904,26 +2878,26 @@ MergeEntityChild2
 	"w"
 	7
 )
-_3130331116
+_1651806471
 (associate
-	"E"
+	"e"
 	3
-	"F"
+	"f"
 	4
-	"G"
+	"g"
 	5
-	"H"
+	"h"
 	6
 )
-_1651806471
+_3130331116
 (associate
-	"e"
+	"E"
 	3
-	"f"
+	"F"
 	4
-	"g"
+	"G"
 	5
-	"h"
+	"H"
 	6
 )
 (unordered_list
@@ -3503,8 +3477,14 @@ difference between DiffEntity2 and new_entity:
 	[]
 	(lambda
 		{
-			E 3
-			F 4
+			E (get
+				(current_value 1)
+				"E"
+			)
+			F (get
+				(current_value 1)
+				"F"
+			)
 			G 5
 			H 6
 		}
@@ -3529,7 +3509,16 @@ difference between DiffEntity2 and new_entity:
 			_
 			[]
 			(lambda
-				{e 3 f 4}
+				{
+					e (get
+						(current_value 1)
+						"e"
+					)
+					f (get
+						(current_value 1)
+						"f"
+					)
+				}
 			)
 		)
 	)
@@ -3555,34 +3544,107 @@ contained_entities new_entity: ["OnlyIn2" "_1985995361" "_2783372341" "DiffEntit
 difference between DiffContainer and DiffEntity2:
 (declare
 	{_ (null) new_entity (null)}
-	(clone_entities _ new_entity)
+	(assign
+		"new_entity"
+		(first
+			(create_entities
+				new_entity
+				(call
+					(lambda
+						(declare
+							{_ (null)}
+							(replace _)
+						)
+					)
+					{
+						_ (retrieve_entity_root _)
+					}
+				)
+			)
+		)
+	)
+	(create_entities
+		(append new_entity "_1985995361")
+		(call
+			(lambda
+				(declare
+					{_ (null)}
+					(replace
+						_
+						[]
+						(lambda
+							{
+								E (null)
+								F (null)
+								G (get
+									(current_value 1)
+									"G"
+								)
+								H (get
+									(current_value 1)
+									"H"
+								)
+							}
+						)
+					)
+				)
+			)
+			{
+				_ (retrieve_entity_root
+					(append _ "_1985995361")
+				)
+			}
+		)
+	)
+	(create_entities
+		(append new_entity "_2783372341")
+		(call
+			(lambda
+				(declare
+					{_ (null)}
+					(replace
+						_
+						[]
+						(lambda
+							{e (null) f (null)}
+						)
+					)
+				)
+			)
+			{
+				_ (retrieve_entity_root
+					(append _ "_2783372341")
+				)
+			}
+		)
+	)
+	(clone_entities
+		(append _ "OnlyIn2")
+		(append new_entity "OnlyIn2")
+	)
+	(clone_entities
+		(append _ "DiffEntityChild1")
+		(append new_entity "DiffEntityChild1")
+	)
 	new_entity
 )
 --mix_entities--
-(associate "b" 4 "c" 3)
+(associate "b" 4 "a" 3)
 MergeEntityChild1
 (associate "x" 3 "y" 4)
 MergeEntityChild2
-(associate
-	"p"
-	3
-	"q"
-	4
-	"v"
-	6
-	"w"
-	7
-)
-_3130331116
-(associate "E" 3 "F" 4 "H" 6)
+(associate "p" 3 "q" 4)
 _1651806471
+(associate "e" 3 "f" 4 "h" 6)
+_3130331116
 (associate
-	"e"
+	"E"
 	3
-	"f"
+	"F"
 	4
-	"g"
+	"G"
 	5
-	"h"
+	"H"
 	6
 )
 --get_entity_comments--
@@ -3671,7 +3733,7 @@ deep sets
 --set_entity_permissions--
 RootTest
-1767108452.43106
+1767291496.111029
 {
 	alter_performance .true
 	environment .true
@@ -4109,52 +4171,51 @@ store to .json normally
 ["Child1" "Child5"]
 ["Child3" "Child4"]
 ["Child6" "Child7"]
-["Child2" "Child3" "Child6" "Child7"]
-["Child1" "Child3" "Child4" "Child5"]
+["Child2" "Child4" "Child6" "Child7"]
+["Child1" "Child2" "Child3" "Child7"]
 ["Child4" "Child6"]
 --query_sample--
-["Child6"]
-["Child5" "Child6"]
-["Child2"]
 ["Child7"]
+["Child5" "Child1"]
+["Child3"]
+["Child2"]
 with weights
 ["Child1"]
 ["Child2"]
 [
-	"Child3"
 	"Child1"
 	"Child1"
+	"Child1"
+	"Child2"
 	"Child2"
 	"Child1"
+	"Child6"
 	"Child2"
 	"Child1"
 	"Child1"
 	"Child2"
+	"Child6"
+	"Child3"
 	"Child2"
 	"Child1"
 	"Child1"
-	"Child2"
-	"Child4"
 	"Child1"
-	"Child2"
-	"Child2"
-	"Child3"
+	"Child4"
 	"Child2"
 	"Child1"
 ]
 [
-	"Child2"
-	"Child1"
 	"Child1"
 	"Child2"
 	"Child1"
 	"Child2"
-	"Child1"
+	"Child7"
 	"Child2"
-	"Child1"
-	"Child1"
-	"Child1"
-	"Child1"
+	"Child2"
+	"Child2"
+	"Child2"
+	"Child2"
+	"Child4"
 	"Child1"
 	"Child1"
 	"Child1"
@@ -4163,20 +4224,21 @@ with weights
 	"Child1"
 	"Child2"
 	"Child2"
+	"Child2"
 ]
 [
 	"Child2"
 	"Child2"
+	"Child3"
+	"Child2"
+	"Child3"
 	"Child2"
 	"Child2"
 	"Child2"
-	"Child7"
 	"Child2"
-	"Child4"
-	"Child5"
 	"Child2"
 ]
-["Child2" "Child7"]
+["Child2" "Child3" "Child4"]
 --query_in_entity_list--
 ["Child6" "Child7"]
 --query_not_in_entity_list--
@@ -4222,14 +4284,14 @@ cascading query_not_in_entity_list: ["Child6" "Child7"]
 unweighted query: {
 	Child1 4
 	Child2 1
-	Child4 100
+	Child3 100
 	Child6 2
 	Child7 10
 }
 weighted query: {
 	Child1 4
 	Child2 1
-	Child4 100
+	Child3 100
 	Child6 2
 	Child7 10
 }
@@ -4243,7 +4305,7 @@ weighted query list of lists: [
 	[-1 2 4 10 100]
 ]
 weighted query list of lists with multiple values: [
-	["Child2" "Child6" "Child1" "Child7" "Child4"]
+	["Child2" "Child6" "Child1" "Child7" "Child3"]
 	[1 2 4 10 100]
 	[-1 2 4 10 100]
 	[-1 1 3 0 100]
@@ -4562,12 +4624,40 @@ cyclic test expected: 155, 200, 190 ...
 deg values of 0 8 and 12:
 200: 0.05555555555555555
 (null
 	##deg 8
 )
-190: 0.045454545454545456
-(null
-	##deg 12
-)
 155: 0.1
 (null
 	##deg 0
 )
+190: 0.045454545454545456
+(null
+	##deg 12
+)
+--query_entity_clusters--
+{
+	g1c1 2
+	g1c2 2
+	g1c3 2
+	g1c4 2
+	g2c1 1
+	g2c2 1
+	g2c3 1
+	g2c4 1
+	g3c1 4
+	g3c2 4
+	g3c3 4
+	g3c4 4
+	g3c5 4
+	g3c6 4
+	g3c7 4
+	g3c8 4
+	g4c1 3
+	g4c2 3
+	g4c3 3
+	g4c4 3
+	g4c5 3
+	g4c6 3
+	g4c7 3
+	g4c8 3
+	g5c1 0
+}
 --contains_label--
 .true
 .false
@@ -4794,7 +4884,7 @@ entity cyclic test:
 			(assign_entity_roots new_entity _)
 		)
 	)
-	(set_entity_rand_seed new_entity "߉$Ot3FN܎5")
+	(set_entity_rand_seed new_entity "@_D/#61xO")
 	new_entity
 )
 3
@@ -4824,7 +4914,7 @@ entity cyclic test:
 			(assign_entity_roots new_entity _)
 		)
 	)
-	(set_entity_rand_seed new_entity "߉$Ot3FN܎5")
+	(set_entity_rand_seed new_entity "@_D/#61xO")
 	new_entity
 )
 cyclic lookup test:
@@ -4870,7 +4960,7 @@ cyclic lookup test 2:
 ["hello" "!"]
 {a1 1.4142135623730951 a2 2 a3 1.4142135623730951}
 {a1 1.4142135623730951 a3 1.4142135623730951}
-{a3 1.4142135623730951}
+{a1 1.4142135623730951}
 {a1 5.0990195135927845 a2 2 a3 5.0990195135927845}
 {a1 1 a3 1 a4 0}
 should print case1 and case2: ["case1" "case2"]
@@ -4884,13 +4974,13 @@ distance symmetry tests
 [
 	[
 		"B"
+		"C"
 		"D"
 		"I"
-		"C"
 		"F"
 		"A"
 		"G"
-		"H"
+		"J"
 	]
 	[
 		0
@@ -4906,13 +4996,13 @@ distance symmetry tests
 [
 	[
 		"B"
-		"I"
+		"C"
 		"F"
 		"A"
+		"I"
 		"D"
-		"C"
-		"G"
 		"H"
+		"G"
 	]
 	[
 		0
@@ -5127,4 +5217,4 @@ rmdir /s /q amlg_code\persistent_tree_test_root
 del /s /q amlg_code\persist_module_test\psm.mdam
 del /s /q amlg_code\persist_module_test.mdam
 --total execution time--
-0.7332229614257812
+1.455152988433838