From f9742e51e5c03f6ce5cd0830bdb40c7e4e7a9f77 Mon Sep 17 00:00:00 2001 From: Diandre Sabale Date: Mon, 13 Nov 2023 11:18:00 -0800 Subject: [PATCH] removed unnecessary experiments and cleaned up existing ones --- .../Scripts/run-cycle-experiments-timed.jl | 37 ----- ...riments-without-join-table-cycle-stats.jl} | 11 +- Experiments/Scripts/run-cycle-experiments.jl | 6 +- .../run-inference-sampling-experiments.jl | 22 +-- ...summary-sampling-experiments-full-range.jl | 24 ---- .../run-summary-sampling-experiments.jl | 24 +--- .../run-summary-sampling-range-query-type.jl | 24 ---- .../run-summary-sampling-with-query-type.jl | 13 ++ Experiments/Scripts/update_experiments.jl | 136 ++---------------- Experiments/build_color_summaries.jl | 27 +++- Experiments/graph_results.jl | 5 +- Experiments/utils.jl | 8 +- Source/CardinalityWithColors.jl | 8 +- 13 files changed, 69 insertions(+), 276 deletions(-) delete mode 100644 Experiments/Scripts/run-cycle-experiments-timed.jl rename Experiments/Scripts/{run-cycle-experiments-detailed-sample.jl => run-cycle-experiments-without-join-table-cycle-stats.jl} (73%) delete mode 100644 Experiments/Scripts/run-summary-sampling-experiments-full-range.jl delete mode 100644 Experiments/Scripts/run-summary-sampling-range-query-type.jl create mode 100644 Experiments/Scripts/run-summary-sampling-with-query-type.jl diff --git a/Experiments/Scripts/run-cycle-experiments-timed.jl b/Experiments/Scripts/run-cycle-experiments-timed.jl deleted file mode 100644 index 65d15e6..0000000 --- a/Experiments/Scripts/run-cycle-experiments-timed.jl +++ /dev/null @@ -1,37 +0,0 @@ -using Plots.PlotMeasures -include("../Experiments.jl") - -datasets::Vector{DATASET} = [aids] -# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents] -# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] -max_cycles = 6 - -experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles] -println("started building") -for experiment_params in experiment_params_list - build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")] - dataset = experiment_params.dataset - summary_params = experiment_params.summary_params - data = load_dataset(dataset) - summary_name = @timed params_to_summary_filename(experiment_params) - summary_file_location = "Experiments/SerializedSummaries/" * summary_name - println("Building Color Summary: ", summary_name) - # normal_results = @timed generate_color_summary(data, summary_params; verbose=1) - # normal_results = @timed generate_color_summary(data, summary_params; verbose=0, detailed_cycles=false) - results= @timed generate_color_summary(data, summary_params; verbose=1, detailed_cycles=false) # println("normal time: ", normal_results.time) - println("detailed time: ", results.time) - summary_size = Base.summarysize(results.value) - serialize(summary_file_location, results.value) - push!(build_times, (string(dataset), - string(summary_params.partitioner), - string(summary_params.num_colors), - string(results.time), - string(summary_size))) - results_filename = params_to_results_filename(experiment_params) - result_file_location = "Experiments/Results/Build_" * results_filename - writedlm(result_file_location, build_times, ",") -end -println("started estimating") -run_estimation_experiments(experiment_params_list) -println("started graphing") -graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-cycle-experiment") \ No newline at end of file diff --git a/Experiments/Scripts/run-cycle-experiments-detailed-sample.jl b/Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl similarity index 73% rename from Experiments/Scripts/run-cycle-experiments-detailed-sample.jl rename to Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl index 32d6c0c..b764c6f 100644 --- a/Experiments/Scripts/run-cycle-experiments-detailed-sample.jl +++ b/Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl @@ -1,9 +1,7 @@ using Plots.PlotMeasures include("../Experiments.jl") -datasets::Vector{DATASET} = [aids] -# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents] -# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] +datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] max_cycles = 6 experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles] @@ -16,10 +14,7 @@ for experiment_params in experiment_params_list summary_name = params_to_summary_filename(experiment_params) summary_file_location = "Experiments/SerializedSummaries/" * summary_name println("Building Color Summary: ", summary_name) - # normal_results = @timed generate_color_summary(data, summary_params; verbose=1) - # normal_results = @timed generate_color_summary(data, summary_params; verbose=0, detailed_cycles=false) - results= @timed generate_color_summary(data, summary_params; verbose=1, detailed_cycles=false) - # println("normal time: ", normal_results.time) + results= @timed generate_color_summary(data, summary_params; verbose=1, use_cycle_join_table=false) println("detailed time: ", results.time) summary_size = Base.summarysize(results.value) serialize(summary_file_location, results.value) @@ -35,4 +30,4 @@ end println("started estimating") run_estimation_experiments(experiment_params_list) println("started graphing") -graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-sample-experiment") \ No newline at end of file +graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="cycles-without-join-table-cycle-stats-experiment") \ No newline at end of file diff --git a/Experiments/Scripts/run-cycle-experiments.jl b/Experiments/Scripts/run-cycle-experiments.jl index 0d419e0..d7e59d8 100644 --- a/Experiments/Scripts/run-cycle-experiments.jl +++ b/Experiments/Scripts/run-cycle-experiments.jl @@ -1,12 +1,10 @@ using Plots.PlotMeasures include("../Experiments.jl") -datasets::Vector{DATASET} = [aids] -# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents] -# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] +datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] max_cycles = 6 - experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles] + println("started building") build_experiments(experiment_params_list) println("started estimating") diff --git a/Experiments/Scripts/run-inference-sampling-experiments.jl b/Experiments/Scripts/run-inference-sampling-experiments.jl index 9e535cb..f2d0f1d 100644 --- a/Experiments/Scripts/run-inference-sampling-experiments.jl +++ b/Experiments/Scripts/run-inference-sampling-experiments.jl @@ -1,30 +1,10 @@ -# TODO: -# - turn sampling up/down (build and inference) -# - cycle length effects -# - reverify ground truth wrt G-captures -# - rerun initial g-care benchmarks and verify old results -# - recreate initial G-Care Benchmark results - -# My tasks: -# - turn sampling up/down -# - cycle length effects - using Plots.PlotMeasures include("../Experiments.jl") -# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human] -# max_partial_paths = 10000 - -# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)] - datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents] max_paths = 60 - experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, inference_max_paths=current_paths) for current_dataset in datasets for current_paths in 2:10:max_paths] + println("started building") build_experiments(experiment_params_list) println("started estimating") diff --git a/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl b/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl deleted file mode 100644 index 9c91a70..0000000 --- a/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl +++ /dev/null @@ -1,24 +0,0 @@ -using Plots.PlotMeasures -include("../Experiments.jl") - -# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human] -# max_partial_paths = 10000 - -# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)] - -datasets::Vector{DATASET} = [hprd, dblp, eu2005, patents] -# datasets::Vector{DATASET} = [youtube] - -max_paths = 2000 - -experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 1100:100:max_paths] -println("started building") -build_experiments(experiment_params_list) -println("started estimating") -run_estimation_experiments(experiment_params_list) -println("started graphing") -graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesfullrangesuperalot") \ No newline at end of file diff --git a/Experiments/Scripts/run-summary-sampling-experiments.jl b/Experiments/Scripts/run-summary-sampling-experiments.jl index f5ef6ef..5c18217 100644 --- a/Experiments/Scripts/run-summary-sampling-experiments.jl +++ b/Experiments/Scripts/run-summary-sampling-experiments.jl @@ -1,30 +1,14 @@ using Plots.PlotMeasures -include("../build_color_summaries.jl") -include("../get_true_cardinalities.jl") -include("../load_datasets.jl") -include("../load_querysets.jl") -include("../run_estimators.jl") -include("../graph_results.jl") -include("../utils.jl") +include("../Experiments.jl") -# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human] -# max_partial_paths = 10000 - -# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)] - -# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] -datasets::Vector{DATASET} = [aids] +datasets::Vector{DATASET} = [aids, wordnet, lubm80, human] max_paths = 1000 +experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 0:200:max_paths] -experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 250:250:max_paths] println("started building") build_experiments(experiment_params_list) println("started estimating") run_estimation_experiments(experiment_params_list) - +println("started graphing") graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=error, grouping=summary_paths, filename="summarysamples") diff --git a/Experiments/Scripts/run-summary-sampling-range-query-type.jl b/Experiments/Scripts/run-summary-sampling-range-query-type.jl deleted file mode 100644 index 1d32312..0000000 --- a/Experiments/Scripts/run-summary-sampling-range-query-type.jl +++ /dev/null @@ -1,24 +0,0 @@ -using Plots.PlotMeasures -include("../Experiments.jl") - -# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human] -# max_partial_paths = 10000 - -# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250), -# ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)] - -# datasets::Vector{DATASET} = [hprd, dblp, eu2005, patents] -# datasets::Vector{DATASET} = [youtube] - -max_paths = 300 - -experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=yeast, partitioner=QuasiStable, summary_max_paths=current_paths) for current_paths in 10:30:max_paths] -println("started building") -build_experiments(experiment_params_list) -println("started estimating") -run_estimation_experiments(experiment_params_list) -println("started graphing") -graph_grouped_box_plot(experiment_params_list, x_type=query_type, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesquerytypesyeast") \ No newline at end of file diff --git a/Experiments/Scripts/run-summary-sampling-with-query-type.jl b/Experiments/Scripts/run-summary-sampling-with-query-type.jl new file mode 100644 index 0000000..b9464e5 --- /dev/null +++ b/Experiments/Scripts/run-summary-sampling-with-query-type.jl @@ -0,0 +1,13 @@ +using Plots.PlotMeasures +include("../Experiments.jl") + +current_dataset = yeast +max_paths = 300 + +experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_paths in 10:30:max_paths] +println("started building") +build_experiments(experiment_params_list) +println("started estimating") +run_estimation_experiments(experiment_params_list) +println("started graphing") +graph_grouped_box_plot(experiment_params_list, x_type=query_type, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesquerytypesyeast") \ No newline at end of file diff --git a/Experiments/Scripts/update_experiments.jl b/Experiments/Scripts/update_experiments.jl index a66c7e5..c9a4db5 100644 --- a/Experiments/Scripts/update_experiments.jl +++ b/Experiments/Scripts/update_experiments.jl @@ -2,138 +2,20 @@ using Plots.PlotMeasures using Graphs include("../Experiments.jl") -datasets::Vector{DATASET} = [human] +datasets::Vector{DATASET} = [wordnet] # datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents] # datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents] max_cycles = 6 proportions_not_updated = [0, 0.2, 0.4, 0.6, 0.8, 1] -# proportions_not_updated = [1.0, 0.8] - -for proportion_not_updated in proportions_not_updated -# only do this with the cloned stuff... we can repeat the same experiments without cloning in the normal version of the code -experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles] +experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_cycle, proportion_not_updated=current_proportion) + for current_dataset in datasets for current_cycle in 2:max_cycles for current_proportion in proportions_not_updated] println("started building") -for experiment_params in experiment_params_list - build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")] - dataset = experiment_params.dataset - summary_params = experiment_params.summary_params - data = load_dataset(dataset) - cloned_data = DataGraph(nv(data.graph)) # remember to add back in the vertex labels - cloned_data.vertex_labels = data.vertex_labels - graph_edges = collect(edges(data.graph)) - println("length: ", length(graph_edges)) - edges_to_add = (length(graph_edges) * proportion_not_updated) - println("edges to add: ", length(graph_edges) * proportion_not_updated) - remaining_edges = [] - for edge in graph_edges - if (edges_to_add > 0) - add_labeled_edge!(cloned_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))])) - edges_to_add -= 1 - # update_edge_labels!(cloned_data, (src(edge), dst(edge)), data.edge_labels[(src(edge), dst(edge))]) - else - push!(remaining_edges, edge) - end - end - summary_name = (params_to_summary_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates") - summary_file_location = "Experiments/SerializedSummaries/" * summary_name - println("Building Color Summary: ", summary_name) - results= @timed generate_color_summary(cloned_data, summary_params; verbose=1, detailed_cycles=false) # println("normal time: ", normal_results.time) - current_summary = results.value - for edge in remaining_edges - # println("ADDING SUMMARY EDGES") - add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), [])) - end - summary_size = Base.summarysize(current_summary) - serialize(summary_file_location, current_summary) - push!(build_times, (string(dataset), - string(summary_params.partitioner), - string(summary_params.num_colors), - string(results.time), - string(summary_size))) - results_filename = params_to_results_filename(experiment_params) - result_file_location = "Experiments/Results/Build_" * results_filename - writedlm(result_file_location, build_times, ",") -end +build_experiments(experiment_params_list) println("started estimating") -for experiment_params in experiment_params_list - dataset = experiment_params.dataset - all_queries = load_querysets([dataset]; require_true_cardinality = true) - summary_name = (params_to_summary_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates") - summary_file_location = "Experiments/SerializedSummaries/" * summary_name - !isfile(summary_file_location) && error("The summary has not been built yet! \n Attempted File Location: $(summary_file_location)") - summary::ColorSummary = deserialize(summary_file_location) - experiment_results = [] - push!(experiment_results, ("UpperBound", "Estimate", "LowerBound", "TrueCard", "EstimationTime", "QueryType")) - for i in 1:length(all_queries[dataset]) - query = all_queries[dataset][i].query - query_path = all_queries[dataset][i].query_path - exact_size = all_queries[dataset][i].exact_size - results = @timed get_cardinality_bounds(query, summary; - max_partial_paths = experiment_params.inference_max_paths, - use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true, - sampling_strategy=experiment_params.sampling_strategy, - only_shortest_path_cycle= experiment_params.only_shortest_path_cycle) - upper_bound = results.value[3] - estimate = max(1, results.value[2]) - lower_bound = results.value[1] - estimate_time = results.time - query_type = all_queries[dataset][i].query_type - push!(experiment_results, (upper_bound, estimate, lower_bound, exact_size, estimate_time, query_type)) - end - results_file_location = "Experiments/Results/Estimation_" * params_to_results_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates" - writedlm(results_file_location, experiment_results, ",") -end +run_estimation_experiments(experiment_params_list) println("started graphing") -# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-cycle-experiment") -x_type::GROUP=dataset -y_type::VALUE=estimate_error -grouping::GROUP=cycle_size -x_values = [] - y_values = [] - groups = [] - for experiment_params in experiment_params_list - # load the results - results_filename = params_to_results_filename(experiment_params) - results_path = "Experiments/Results/Estimation_" * results_filename * "with" * string(proportion_not_updated) * "updates" - # println("results path: ", results_path) - results_df = CSV.read(results_path, DataFrame; normalizenames=true) - - # get the x_value and grouping (same for all results in this experiment param) - - # keep track of the data points - for i in 1:nrow(results_df) - current_x = x_type == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, x_type) - current_group = grouping == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, grouping) - current_y = 0 - if y_type == estimate_error - current_y = results_df[i, :Estimate] / results_df[i, :TrueCard] - else # y_type == runtime - current_y = results_df[i, :EstimationTime] - end - # push the errors and their groupings into the correct vector - push!(x_values, current_x) - push!(y_values, current_y) - push!(groups, current_group) - end - end - println("starting graphs") - - # This seems to be necessary for using Plots.jl outside of the ipynb framework. - # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15 - ENV["GKSwstype"]="100" - gbplot = groupedboxplot(x_values, y_values, group = groups, yscale =:log10, - ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 10^-2, 1, 10^2, 10^5, 10^10], - legend = :outertopleft, size = (1000, 600)) - xlabel!(gbplot, "Dataset") - ylabel!(gbplot, "Accuracy") - plotname = "cycleswith" * string(proportion_not_updated) * "withcycleupdateshuman" * ".png" - savefig(gbplot, "Experiments/Results/Figures/" * plotname) - -# first, take the data graph and collect its list of edges -# then, create a new data graph with the same number of nodes -# from the original graph, take the list of edges -# for a portion of the edges, put them into the new graph directly -# for the remaining edges, add them as "summary edges" using the summary -# then, run a query, using the "real" graph and the "updated" graph -end \ No newline at end of file +# compare how overall accuracy is affected by summary updates +# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates") +# compare how cycle stat accuracies are affected by summary updates +graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates") \ No newline at end of file diff --git a/Experiments/build_color_summaries.jl b/Experiments/build_color_summaries.jl index 992c98a..4285222 100644 --- a/Experiments/build_color_summaries.jl +++ b/Experiments/build_color_summaries.jl @@ -4,13 +4,34 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams}) dataset = experiment_params.dataset summary_params = experiment_params.summary_params data = load_dataset(dataset) + cloned_data = DataGraph(nv(data.graph)) + remaining_edges = [] + if (experiment_params.summary_params.proportion_not_updated < 1.0) + cloned_data.vertex_labels = data.vertex_labels + graph_edges = collect(edges(data.graph)) + # edges_to_add = (length(graph_edges) * experiment_params.summary_params.proportion_not_updated) + for edge in graph_edges + if (rand() < experiment_params.summary_params.proportion_not_updated) + add_labeled_edge!(cloned_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))])) + # edges_to_add -= 1 + else + push!(remaining_edges, edge) + end + end + end summary_name = params_to_summary_filename(experiment_params) summary_file_location = "Experiments/SerializedSummaries/" * summary_name println("Building Color Summary: ", summary_name) timing_vec = Float64[] - results = @timed generate_color_summary(data, summary_params; verbose=1, timing_vec=timing_vec) - summary_size = Base.summarysize(results.value) - serialize(summary_file_location, results.value) + results = @timed generate_color_summary((experiment_params.summary_params.proportion_not_updated < 1.0) ? cloned_data : data, summary_params; verbose=1, timing_vec=timing_vec) + current_summary = results.value + if (experiment_params.summary_params.proportion_not_updated < 1.0) + for edge in remaining_edges + add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), [])) + end + end + summary_size = Base.summarysize(current_summary) + serialize(summary_file_location, current_summary) push!(build_times, (string(dataset), string(summary_params.partitioner), string(summary_params.num_colors), diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl index 0a28718..e93d9c8 100644 --- a/Experiments/graph_results.jl +++ b/Experiments/graph_results.jl @@ -1,5 +1,4 @@ -@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase -#todo: query type +@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated @enum VALUE estimate_error runtime build_time memory_footprint @@ -244,6 +243,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR return experiment_param.only_shortest_path_cycle elseif value_type == number_of_colors return experiment_param.summary_params.num_colors + elseif value_type == proportion_not_updated + return experiment_param.summary_params.proportion_not_updated else # default to grouping by technique return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds) diff --git a/Experiments/utils.jl b/Experiments/utils.jl index 8eca011..d02d2f2 100644 --- a/Experiments/utils.jl +++ b/Experiments/utils.jl @@ -16,13 +16,14 @@ struct ExperimentParams function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6, only_shortest_path_cycle=false, summary_max_paths=1000, partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true, - sampling_strategy=redistributive, label_refining_rounds = 0) + sampling_strategy=redistributive, label_refining_rounds = 0, proportion_not_updated=1.0) return new(dataset, ColorSummaryParams(num_colors=num_colors, max_cycle_size=max_cycle_size, max_partial_paths=summary_max_paths, partitioner=partitioner, weighting=weighting, - label_refining_rounds=label_refining_rounds), + label_refining_rounds=label_refining_rounds, + proportion_not_updated=proportion_not_updated), inference_max_paths, only_shortest_path_cycle, use_partial_sums, @@ -37,7 +38,8 @@ function params_to_results_filename(experiment_params::ExperimentParams) name *= string(experiment_params.inference_max_paths) * "_" name *= string(experiment_params.only_shortest_path_cycle) * "_" name *= string(experiment_params.use_partial_sums) * "_" - name *= string(experiment_params.sampling_strategy) * ".csv" + name *= string(experiment_params.sampling_strategy) * "_" + name *= string(experiment_params.summary_params.proportion_not_updated) * ".csv" return name end diff --git a/Source/CardinalityWithColors.jl b/Source/CardinalityWithColors.jl index 5608241..2a29b17 100644 --- a/Source/CardinalityWithColors.jl +++ b/Source/CardinalityWithColors.jl @@ -33,10 +33,11 @@ struct ColorSummaryParams partitioner::PARTITIONER weighting::Bool label_refining_rounds::Int + proportion_not_updated::Float16 function ColorSummaryParams(;num_colors::Int=64, max_cycle_size=4, max_partial_paths=1000, - partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0) - return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds) + partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0, proportion_not_updated = 1.0) + return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds, proportion_not_updated) end end @@ -45,7 +46,8 @@ function params_to_string(params::ColorSummaryParams) summary_name *= string(params.num_colors) * "_" summary_name *= string(params.max_cycle_size) * "_" summary_name *= string(params.max_partial_paths)* "_" - summary_name *= string(params.label_refining_rounds) + summary_name *= string(params.label_refining_rounds)* "_" + summary_name *= string(params.proportion_not_updated) return summary_name end