From f9742e51e5c03f6ce5cd0830bdb40c7e4e7a9f77 Mon Sep 17 00:00:00 2001
From: Diandre Sabale <dmbs@dragon.cs.washington.edu>
Date: Mon, 13 Nov 2023 11:18:00 -0800
Subject: [PATCH] removed unnecessary experiments and cleaned up existing ones

---
 .../Scripts/run-cycle-experiments-timed.jl    |  37 -----
 ...riments-without-join-table-cycle-stats.jl} |  11 +-
 Experiments/Scripts/run-cycle-experiments.jl  |   6 +-
 .../run-inference-sampling-experiments.jl     |  22 +--
 ...summary-sampling-experiments-full-range.jl |  24 ----
 .../run-summary-sampling-experiments.jl       |  24 +---
 .../run-summary-sampling-range-query-type.jl  |  24 ----
 .../run-summary-sampling-with-query-type.jl   |  13 ++
 Experiments/Scripts/update_experiments.jl     | 136 ++----------------
 Experiments/build_color_summaries.jl          |  27 +++-
 Experiments/graph_results.jl                  |   5 +-
 Experiments/utils.jl                          |   8 +-
 Source/CardinalityWithColors.jl               |   8 +-
 13 files changed, 69 insertions(+), 276 deletions(-)
 delete mode 100644 Experiments/Scripts/run-cycle-experiments-timed.jl
 rename Experiments/Scripts/{run-cycle-experiments-detailed-sample.jl => run-cycle-experiments-without-join-table-cycle-stats.jl} (73%)
 delete mode 100644 Experiments/Scripts/run-summary-sampling-experiments-full-range.jl
 delete mode 100644 Experiments/Scripts/run-summary-sampling-range-query-type.jl
 create mode 100644 Experiments/Scripts/run-summary-sampling-with-query-type.jl

diff --git a/Experiments/Scripts/run-cycle-experiments-timed.jl b/Experiments/Scripts/run-cycle-experiments-timed.jl
deleted file mode 100644
index 65d15e6..0000000
--- a/Experiments/Scripts/run-cycle-experiments-timed.jl
+++ /dev/null
@@ -1,37 +0,0 @@
-using Plots.PlotMeasures
-include("../Experiments.jl")
-
-datasets::Vector{DATASET} = [aids]
-# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
-# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
-max_cycles = 6
-
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
-println("started building")
-for experiment_params in experiment_params_list
-    build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")]
-    dataset = experiment_params.dataset
-    summary_params = experiment_params.summary_params
-    data = load_dataset(dataset)
-    summary_name = @timed params_to_summary_filename(experiment_params)
-    summary_file_location = "Experiments/SerializedSummaries/" * summary_name
-    println("Building Color Summary: ", summary_name)
-    # normal_results = @timed generate_color_summary(data, summary_params; verbose=1)
-    # normal_results = @timed generate_color_summary(data, summary_params; verbose=0, detailed_cycles=false)
-    results= @timed generate_color_summary(data, summary_params; verbose=1, detailed_cycles=false)    # println("normal time: ", normal_results.time)
-    println("detailed time: ", results.time)
-    summary_size = Base.summarysize(results.value)
-    serialize(summary_file_location, results.value)
-    push!(build_times, (string(dataset),
-                         string(summary_params.partitioner),
-                         string(summary_params.num_colors),
-                         string(results.time),
-                         string(summary_size)))
-    results_filename = params_to_results_filename(experiment_params)
-    result_file_location = "Experiments/Results/Build_" * results_filename
-    writedlm(result_file_location, build_times, ",")
-end
-println("started estimating")
-run_estimation_experiments(experiment_params_list)
-println("started graphing")
-graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-cycle-experiment")
\ No newline at end of file
diff --git a/Experiments/Scripts/run-cycle-experiments-detailed-sample.jl b/Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl
similarity index 73%
rename from Experiments/Scripts/run-cycle-experiments-detailed-sample.jl
rename to Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl
index 32d6c0c..b764c6f 100644
--- a/Experiments/Scripts/run-cycle-experiments-detailed-sample.jl
+++ b/Experiments/Scripts/run-cycle-experiments-without-join-table-cycle-stats.jl
@@ -1,9 +1,7 @@
 using Plots.PlotMeasures
 include("../Experiments.jl")
 
-datasets::Vector{DATASET} = [aids]
-# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
-# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
+datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
 max_cycles = 6
 
 experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
@@ -16,10 +14,7 @@ for experiment_params in experiment_params_list
     summary_name = params_to_summary_filename(experiment_params)
     summary_file_location = "Experiments/SerializedSummaries/" * summary_name
     println("Building Color Summary: ", summary_name)
-    # normal_results = @timed generate_color_summary(data, summary_params; verbose=1)
-    # normal_results = @timed generate_color_summary(data, summary_params; verbose=0, detailed_cycles=false)
-    results= @timed generate_color_summary(data, summary_params; verbose=1, detailed_cycles=false)
-    # println("normal time: ", normal_results.time)
+    results= @timed generate_color_summary(data, summary_params; verbose=1, use_cycle_join_table=false)
     println("detailed time: ", results.time)
     summary_size = Base.summarysize(results.value)
     serialize(summary_file_location, results.value)
@@ -35,4 +30,4 @@ end
 println("started estimating")
 run_estimation_experiments(experiment_params_list)
 println("started graphing")
-graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-sample-experiment")
\ No newline at end of file
+graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="cycles-without-join-table-cycle-stats-experiment")
\ No newline at end of file
diff --git a/Experiments/Scripts/run-cycle-experiments.jl b/Experiments/Scripts/run-cycle-experiments.jl
index 0d419e0..d7e59d8 100644
--- a/Experiments/Scripts/run-cycle-experiments.jl
+++ b/Experiments/Scripts/run-cycle-experiments.jl
@@ -1,12 +1,10 @@
 using Plots.PlotMeasures
 include("../Experiments.jl")
 
-datasets::Vector{DATASET} = [aids]
-# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
-# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
+datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
 max_cycles = 6
-
 experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
+
 println("started building")
 build_experiments(experiment_params_list)
 println("started estimating")
diff --git a/Experiments/Scripts/run-inference-sampling-experiments.jl b/Experiments/Scripts/run-inference-sampling-experiments.jl
index 9e535cb..f2d0f1d 100644
--- a/Experiments/Scripts/run-inference-sampling-experiments.jl
+++ b/Experiments/Scripts/run-inference-sampling-experiments.jl
@@ -1,30 +1,10 @@
-# TODO:
-# - turn sampling up/down (build and inference)
-# - cycle length effects
-# - reverify ground truth wrt G-captures
-# - rerun initial g-care benchmarks and verify old results
-# - recreate initial G-Care Benchmark results
-
-# My tasks:
-# - turn sampling up/down
-# - cycle length effects
-
 using Plots.PlotMeasures
 include("../Experiments.jl")
 
-# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
-# max_partial_paths = 10000
-
-# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)]
-
 datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
 max_paths = 60
-
 experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, inference_max_paths=current_paths) for current_dataset in datasets for current_paths in 2:10:max_paths]
+
 println("started building")
 build_experiments(experiment_params_list)
 println("started estimating")
diff --git a/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl b/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl
deleted file mode 100644
index 9c91a70..0000000
--- a/Experiments/Scripts/run-summary-sampling-experiments-full-range.jl
+++ /dev/null
@@ -1,24 +0,0 @@
-using Plots.PlotMeasures
-include("../Experiments.jl")
-
-# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
-# max_partial_paths = 10000
-
-# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)]
-
-datasets::Vector{DATASET} = [hprd, dblp, eu2005, patents]
-# datasets::Vector{DATASET} = [youtube]
-
-max_paths = 2000
-
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 1100:100:max_paths]
-println("started building")
-build_experiments(experiment_params_list)
-println("started estimating")
-run_estimation_experiments(experiment_params_list)
-println("started graphing")
-graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesfullrangesuperalot")
\ No newline at end of file
diff --git a/Experiments/Scripts/run-summary-sampling-experiments.jl b/Experiments/Scripts/run-summary-sampling-experiments.jl
index f5ef6ef..5c18217 100644
--- a/Experiments/Scripts/run-summary-sampling-experiments.jl
+++ b/Experiments/Scripts/run-summary-sampling-experiments.jl
@@ -1,30 +1,14 @@
 using Plots.PlotMeasures
-include("../build_color_summaries.jl")
-include("../get_true_cardinalities.jl")
-include("../load_datasets.jl")
-include("../load_querysets.jl")
-include("../run_estimators.jl")
-include("../graph_results.jl")
-include("../utils.jl")
+include("../Experiments.jl")
 
-# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
-# max_partial_paths = 10000
-
-# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)]
-
-# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
-datasets::Vector{DATASET} = [aids]
 
+datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
 max_paths = 1000
+experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 0:200:max_paths]
 
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 250:250:max_paths]
 println("started building")
 build_experiments(experiment_params_list)
 println("started estimating")
 run_estimation_experiments(experiment_params_list)
-
-graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=error, grouping=summary_paths, filename="summarysamples")
+println("started graphing")
+graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=summary_paths, filename="summarysamples")
diff --git a/Experiments/Scripts/run-summary-sampling-range-query-type.jl b/Experiments/Scripts/run-summary-sampling-range-query-type.jl
deleted file mode 100644
index 1d32312..0000000
--- a/Experiments/Scripts/run-summary-sampling-range-query-type.jl
+++ /dev/null
@@ -1,24 +0,0 @@
-using Plots.PlotMeasures
-include("../Experiments.jl")
-
-# datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
-# max_partial_paths = 10000
-
-# experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=2),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=10),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=50),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=250),
-#                                                     ExperimentParams(dataset=aids, partitioner=QuasiStable, inference_max_paths=1250)]
-
-# datasets::Vector{DATASET} = [hprd, dblp, eu2005, patents]
-# datasets::Vector{DATASET} = [youtube]
-
-max_paths = 300
-
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=yeast, partitioner=QuasiStable, summary_max_paths=current_paths) for current_paths in 10:30:max_paths]
-println("started building")
-build_experiments(experiment_params_list)
-println("started estimating")
-run_estimation_experiments(experiment_params_list)
-println("started graphing")
-graph_grouped_box_plot(experiment_params_list, x_type=query_type, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesquerytypesyeast")
\ No newline at end of file
diff --git a/Experiments/Scripts/run-summary-sampling-with-query-type.jl b/Experiments/Scripts/run-summary-sampling-with-query-type.jl
new file mode 100644
index 0000000..b9464e5
--- /dev/null
+++ b/Experiments/Scripts/run-summary-sampling-with-query-type.jl
@@ -0,0 +1,13 @@
+using Plots.PlotMeasures
+include("../Experiments.jl")
+
+current_dataset::DATASET = yeast
+max_paths = 300
+
+experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_paths in 10:30:max_paths]
+println("started building")
+build_experiments(experiment_params_list)
+println("started estimating")
+run_estimation_experiments(experiment_params_list)
+println("started graphing")
+graph_grouped_box_plot(experiment_params_list, x_type=query_type, y_type=estimate_error, grouping=summary_paths, filename="summarysamplesquerytypes" * string(current_dataset))
diff --git a/Experiments/Scripts/update_experiments.jl b/Experiments/Scripts/update_experiments.jl
index a66c7e5..c9a4db5 100644
--- a/Experiments/Scripts/update_experiments.jl
+++ b/Experiments/Scripts/update_experiments.jl
@@ -2,138 +2,20 @@ using Plots.PlotMeasures
 using Graphs
 include("../Experiments.jl")
 
-datasets::Vector{DATASET} = [human]
+datasets::Vector{DATASET} = [wordnet]
 # datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
 # datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
 max_cycles = 6
 proportions_not_updated = [0, 0.2, 0.4, 0.6, 0.8, 1]
-# proportions_not_updated = [1.0, 0.8]
 
-
-for proportion_not_updated in proportions_not_updated
-# only do this with the cloned stuff... we can repeat the same experiments without cloning in the normal version of the code
-experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
+experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_cycle, proportion_not_updated=current_proportion) 
+                                                    for current_dataset in datasets for current_cycle in 2:max_cycles for current_proportion in proportions_not_updated]
 println("started building")
-for experiment_params in experiment_params_list
-    build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")]
-    dataset = experiment_params.dataset
-    summary_params = experiment_params.summary_params
-    data = load_dataset(dataset)
-    cloned_data = DataGraph(nv(data.graph)) # remember to add back in the vertex labels
-    cloned_data.vertex_labels = data.vertex_labels
-    graph_edges = collect(edges(data.graph))
-    println("length: ", length(graph_edges))
-    edges_to_add = (length(graph_edges) * proportion_not_updated)
-    println("edges to add: ", length(graph_edges) * proportion_not_updated)
-    remaining_edges = []
-    for edge in graph_edges
-        if (edges_to_add > 0)
-            add_labeled_edge!(cloned_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))]))
-            edges_to_add -= 1
-            # update_edge_labels!(cloned_data, (src(edge), dst(edge)), data.edge_labels[(src(edge), dst(edge))])
-        else
-            push!(remaining_edges, edge)
-        end
-    end
-    summary_name = (params_to_summary_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates")
-    summary_file_location = "Experiments/SerializedSummaries/" * summary_name
-    println("Building Color Summary: ", summary_name)
-    results= @timed generate_color_summary(cloned_data, summary_params; verbose=1, detailed_cycles=false)    # println("normal time: ", normal_results.time)
-    current_summary = results.value
-    for edge in remaining_edges
-        # println("ADDING SUMMARY EDGES")
-        add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), []))
-    end
-    summary_size = Base.summarysize(current_summary)
-    serialize(summary_file_location, current_summary)
-    push!(build_times, (string(dataset),
-                         string(summary_params.partitioner),
-                         string(summary_params.num_colors),
-                         string(results.time),
-                         string(summary_size)))
-    results_filename = params_to_results_filename(experiment_params)
-    result_file_location = "Experiments/Results/Build_" * results_filename
-    writedlm(result_file_location, build_times, ",")
-end
+build_experiments(experiment_params_list)
 println("started estimating")
-for experiment_params in experiment_params_list
-    dataset = experiment_params.dataset
-    all_queries = load_querysets([dataset]; require_true_cardinality = true)
-    summary_name = (params_to_summary_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates")
-    summary_file_location = "Experiments/SerializedSummaries/" * summary_name
-    !isfile(summary_file_location) && error("The summary has not been built yet! \n Attempted File Location: $(summary_file_location)")
-    summary::ColorSummary = deserialize(summary_file_location)
-    experiment_results = []
-    push!(experiment_results, ("UpperBound", "Estimate", "LowerBound", "TrueCard", "EstimationTime", "QueryType"))
-    for i in 1:length(all_queries[dataset])
-        query = all_queries[dataset][i].query
-        query_path = all_queries[dataset][i].query_path
-        exact_size = all_queries[dataset][i].exact_size
-        results = @timed get_cardinality_bounds(query, summary;
-                            max_partial_paths = experiment_params.inference_max_paths,
-                            use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
-                            sampling_strategy=experiment_params.sampling_strategy,
-                            only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)
-        upper_bound = results.value[3]
-        estimate = max(1, results.value[2])
-        lower_bound = results.value[1]
-        estimate_time = results.time
-        query_type = all_queries[dataset][i].query_type
-        push!(experiment_results, (upper_bound, estimate, lower_bound, exact_size, estimate_time, query_type))
-    end
-    results_file_location = "Experiments/Results/Estimation_"  * params_to_results_filename(experiment_params) * "with" * string(proportion_not_updated) * "updates"
-    writedlm(results_file_location, experiment_results, ",")
-end
+run_estimation_experiments(experiment_params_list)
 println("started graphing")
-# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-cycle-experiment")
-x_type::GROUP=dataset
-y_type::VALUE=estimate_error
-grouping::GROUP=cycle_size    
-x_values = []
-    y_values = []
-    groups = []
-    for experiment_params in experiment_params_list
-        # load the results
-        results_filename = params_to_results_filename(experiment_params)
-        results_path = "Experiments/Results/Estimation_" * results_filename * "with" * string(proportion_not_updated) * "updates"
-        # println("results path: ", results_path)
-        results_df = CSV.read(results_path, DataFrame; normalizenames=true)
-
-        # get the x_value and grouping (same for all results in this experiment param)
-
-        # keep track of the data points
-        for i in 1:nrow(results_df)
-            current_x = x_type == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, x_type)
-            current_group = grouping == query_type ? results_df[i, :QueryType] : get_value_from_param(experiment_params, grouping)
-            current_y = 0
-            if y_type == estimate_error
-                current_y = results_df[i, :Estimate] / results_df[i, :TrueCard]
-            else # y_type == runtime
-                current_y = results_df[i, :EstimationTime]
-            end
-            # push the errors and their groupings into the correct vector
-            push!(x_values, current_x)
-            push!(y_values, current_y)
-            push!(groups, current_group)
-        end
-    end
-    println("starting graphs")
-
-    # This seems to be necessary for using Plots.jl outside of the ipynb framework.
-    # See this: https://discourse.julialang.org/t/deactivate-plot-display-to-avoid-need-for-x-server/19359/15
-    ENV["GKSwstype"]="100"
-    gbplot = groupedboxplot(x_values, y_values, group = groups, yscale =:log10,
-                            ylims=[10^-13, 10^11], yticks=[10^-10, 10^-5, 10^-2, 1, 10^2, 10^5, 10^10],
-                            legend = :outertopleft, size = (1000, 600))
-    xlabel!(gbplot, "Dataset")
-    ylabel!(gbplot, "Accuracy")
-    plotname = "cycleswith" * string(proportion_not_updated) * "withcycleupdateshuman" * ".png"
-    savefig(gbplot, "Experiments/Results/Figures/" * plotname)
-
-# first, take the data graph and collect its list of edges
-# then, create a new data graph with the same number of nodes
-# from the original graph, take the list of edges
-# for a portion of the edges, put them into the new graph directly
-# for the remaining edges, add them as "summary edges" using the summary
-# then, run a query, using the "real" graph and the "updated" graph
-end
\ No newline at end of file
+# compare how overall accuracy is affected by summary updates
+# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
+# compare how cycle stat accuracies are affected by summary updates
+graph_grouped_box_plot(experiment_params_list, x_type=proportion_not_updated, y_type=estimate_error, grouping=cycle_size, filename="cycle-stats-and-updates")
\ No newline at end of file
diff --git a/Experiments/build_color_summaries.jl b/Experiments/build_color_summaries.jl
index 992c98a..4285222 100644
--- a/Experiments/build_color_summaries.jl
+++ b/Experiments/build_color_summaries.jl
@@ -4,13 +4,34 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams})
         dataset = experiment_params.dataset
         summary_params = experiment_params.summary_params
         data = load_dataset(dataset)
+        # When proportion_not_updated < 1.0, build the summary from a random subset of the
+        # edges and replay the remaining edges onto the finished summary as updates.
+        kept_proportion = summary_params.proportion_not_updated
+        build_data = data
+        remaining_edges = []
+        if kept_proportion < 1.0
+            # Only allocate the partial graph when updates are actually being simulated.
+            build_data = DataGraph(nv(data.graph))
+            build_data.vertex_labels = data.vertex_labels
+            for edge in collect(edges(data.graph))
+                if rand() < kept_proportion
+                    add_labeled_edge!(build_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))]))
+                else
+                    push!(remaining_edges, edge)
+                end
+            end
+        end
         summary_name = params_to_summary_filename(experiment_params)
         summary_file_location = "Experiments/SerializedSummaries/" * summary_name
         println("Building Color Summary: ", summary_name)
         timing_vec = Float64[]
-        results = @timed generate_color_summary(data, summary_params; verbose=1, timing_vec=timing_vec)
-        summary_size = Base.summarysize(results.value)
-        serialize(summary_file_location, results.value)
+        results = @timed generate_color_summary(build_data, summary_params; verbose=1, timing_vec=timing_vec)
+        current_summary = results.value
+        for edge in remaining_edges  # empty unless updates are being simulated
+            add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), []))
+        end
+        summary_size = Base.summarysize(current_summary)
+        serialize(summary_file_location, current_summary)
         push!(build_times, (string(dataset),
                              string(summary_params.partitioner),
                              string(summary_params.num_colors),
diff --git a/Experiments/graph_results.jl b/Experiments/graph_results.jl
index 0a28718..e93d9c8 100644
--- a/Experiments/graph_results.jl
+++ b/Experiments/graph_results.jl
@@ -1,5 +1,4 @@
-@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase
-#todo: query type
+@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated
 
 @enum VALUE estimate_error runtime build_time memory_footprint
 
@@ -244,6 +243,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
         return experiment_param.only_shortest_path_cycle
     elseif value_type == number_of_colors
         return experiment_param.summary_params.num_colors
+    elseif value_type == proportion_not_updated
+        return experiment_param.summary_params.proportion_not_updated
     else
         # default to grouping by technique
         return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds)
diff --git a/Experiments/utils.jl b/Experiments/utils.jl
index 8eca011..d02d2f2 100644
--- a/Experiments/utils.jl
+++ b/Experiments/utils.jl
@@ -16,13 +16,14 @@ struct ExperimentParams
     function ExperimentParams(;dataset::DATASET,  num_colors::Int=64, max_cycle_size=6,
         only_shortest_path_cycle=false, summary_max_paths=1000,
         partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true,
-        sampling_strategy=redistributive, label_refining_rounds = 0)
+        sampling_strategy=redistributive, label_refining_rounds = 0, proportion_not_updated=1.0)
         return new(dataset, ColorSummaryParams(num_colors=num_colors,
                                                        max_cycle_size=max_cycle_size,
                                                        max_partial_paths=summary_max_paths,
                                                        partitioner=partitioner,
                                                        weighting=weighting,
-                                                       label_refining_rounds=label_refining_rounds),
+                                                       label_refining_rounds=label_refining_rounds,
+                                                       proportion_not_updated=proportion_not_updated),
                     inference_max_paths,
                     only_shortest_path_cycle,
                     use_partial_sums,
@@ -37,7 +38,8 @@ function params_to_results_filename(experiment_params::ExperimentParams)
     name *= string(experiment_params.inference_max_paths) * "_"
     name *= string(experiment_params.only_shortest_path_cycle) * "_"
     name *= string(experiment_params.use_partial_sums) * "_"
-    name *= string(experiment_params.sampling_strategy) * ".csv"
+    name *= string(experiment_params.sampling_strategy) * "_"
+    name *= string(experiment_params.summary_params.proportion_not_updated) * ".csv"
     return name
 end
 
diff --git a/Source/CardinalityWithColors.jl b/Source/CardinalityWithColors.jl
index 5608241..2a29b17 100644
--- a/Source/CardinalityWithColors.jl
+++ b/Source/CardinalityWithColors.jl
@@ -33,10 +33,11 @@ struct ColorSummaryParams
     partitioner::PARTITIONER
     weighting::Bool
     label_refining_rounds::Int
+    proportion_not_updated::Float64
 
     function ColorSummaryParams(;num_colors::Int=64, max_cycle_size=4, max_partial_paths=1000,
-            partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0)
-        return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds)
+            partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0, proportion_not_updated = 1.0)
+        return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds, proportion_not_updated)
     end
 end
 
@@ -45,7 +46,8 @@ function params_to_string(params::ColorSummaryParams)
     summary_name *= string(params.num_colors) * "_"
     summary_name *= string(params.max_cycle_size) * "_"
     summary_name *= string(params.max_partial_paths)* "_"
-    summary_name *= string(params.label_refining_rounds)
+    summary_name *= string(params.label_refining_rounds)* "_"
+    summary_name *= string(params.proportion_not_updated)
     return summary_name
 end