Skip to content

Commit

Permalink
Merge branch 'main' into kbd-online-sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebd99 committed Nov 14, 2023
2 parents 6048a04 + e3d03d5 commit 7c5ea5a
Show file tree
Hide file tree
Showing 24 changed files with 498 additions and 279 deletions.
1 change: 1 addition & 0 deletions Experiments/Experiments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ using StatsPlots
using CSV, DataFrames
using Parquet2: Dataset
using DelimitedFiles: writedlm
using BenchmarkTools

include("../Source/CardinalityWithColors.jl")
include("utils.jl")
Expand Down
6 changes: 3 additions & 3 deletions Experiments/Scripts/build_time_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
using Profile
include("../Experiments.jl")

#datasets = [aids, yeast, hprd, dblp, youtube, wordnet]
datasets = [aids, yago, hprd, dblp]
datasets = [aids, yeast, hprd, dblp, youtube, wordnet]
#datasets = [wordnet]
experiment_params = Vector{ExperimentParams}()
build_params = Vector{ExperimentParams}()
for dataset in datasets
Expand All @@ -13,5 +13,5 @@ end

graph_grouped_bar_plot(build_params; grouping=build_phase,
y_type=build_time,
y_lims=[0, 6000],
y_lims=[0, 360],
filename="build_time")
6 changes: 3 additions & 3 deletions Experiments/Scripts/comparison_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ end

#build_experiments(experiment_params)

#run_estimation_experiments(experiment_params)
run_estimation_experiments(experiment_params)

graph_grouped_boxplot_with_comparison_methods(experiment_params; ylims=[10^-5, 10^2],y_type = runtime, grouping=number_of_colors, y_label="Runtime (s)", filename="comparison_exps_runtime")
graph_grouped_boxplot_with_comparison_methods(experiment_params; y_type = estimate_error, grouping=number_of_colors, y_label="Relative Error", filename="comparison_exps_error")
graph_grouped_boxplot_with_comparison_methods(experiment_params; ylims=[10^-5, 10^2],y_type = runtime, grouping=number_of_colors, y_label="Runtime (s)", filename="comparison_exps_runtime_2")
graph_grouped_boxplot_with_comparison_methods(experiment_params; y_type = estimate_error, grouping=number_of_colors, y_label="Relative Error", filename="comparison_exps_error_2")
4 changes: 2 additions & 2 deletions Experiments/Scripts/memory_exps.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ using Profile
include("../Experiments.jl")

#datasets = [aids, yeast, hprd, dblp, youtube, wordnet]
datasets = [aids, yeast, hprd, dblp, wordnet]
num_colors = [4, 8, 16, 32, 64, 128]
datasets = [wordnet]
num_colors = [4, 8, 16, 32, 64, 128, 256]
experiment_params = Vector{ExperimentParams}()
build_params = Vector{ExperimentParams}()
for dataset in datasets
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
using Plots.PlotMeasures
include("../Experiments.jl")

# Experiment script: for every dataset and every maximum cycle size in 2:max_cycles,
# build a color summary with the cycle join table disabled, record the build time and
# the in-memory size of the summary, then run the estimation experiments and plot the
# estimate error grouped by cycle size.
datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
println("started building")
for experiment_params in experiment_params_list
    # One results file is written per experiment, so the table restarts from its
    # header row on every iteration.
    build_times = [("Dataset", "Partitioner", "NumColors", "BuildTime", "MemoryFootprint")]
    # Deliberately not named `dataset`: that name is passed as the `x_type` grouping
    # value in the plotting call at the bottom of this script, and rebinding it inside
    # a top-level loop triggers Julia's soft-scope ambiguity warning (or an error when
    # the script is evaluated interactively).
    target_dataset = experiment_params.dataset
    summary_params = experiment_params.summary_params
    data = load_dataset(target_dataset)
    summary_name = params_to_summary_filename(experiment_params)
    summary_file_location = "Experiments/SerializedSummaries/" * summary_name
    println("Building Color Summary: ", summary_name)
    # `@timed` returns the built summary in `.value` and the elapsed seconds in `.time`.
    results = @timed generate_color_summary(data, summary_params; verbose=1, use_cycle_join_table=false)
    println("detailed time: ", results.time)
    summary_size = Base.summarysize(results.value)
    serialize(summary_file_location, results.value)
    push!(build_times, (string(target_dataset),
                        string(summary_params.partitioner),
                        string(summary_params.num_colors),
                        string(results.time),
                        string(summary_size)))
    results_filename = params_to_results_filename(experiment_params)
    result_file_location = "Experiments/Results/Build_" * results_filename
    writedlm(result_file_location, build_times, ",")
end
println("started estimating")
run_estimation_experiments(experiment_params_list)
println("started graphing")
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="cycles-without-join-table-cycle-stats-experiment")
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
using Plots.PlotMeasures
include("Experiments/build_color_summaries.jl")
include("Experiments/get_true_cardinalities.jl")
include("Experiments/load_datasets.jl")
include("Experiments/load_querysets.jl")
include("Experiments/run_estimators.jl")
include("Experiments/graph_results.jl")
include("Experiments/utils.jl")
using Plots.PlotMeasures
include("../Experiments.jl")

# datasets::Vector{DATASET} = [aids, human, lubm80, yago, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]

println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=error, grouping=cycle_size, filename="justaidsagain")
println("started graphing")
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="cycle-experiment")
13 changes: 13 additions & 0 deletions Experiments/Scripts/run-inference-sampling-experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using Plots.PlotMeasures
include("../Experiments.jl")

# Experiment script: sweep the inference-time path budget (`inference_max_paths`)
# over 2:10:max_paths for each dataset, then build, estimate and plot the estimate
# error grouped by that budget.
datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
max_paths = 60

# Datasets vary slowest and path budgets fastest, i.e. all budgets for the first
# dataset come first in the list.
experiment_params_list::Vector{ExperimentParams} = ExperimentParams[]
for current_dataset in datasets, current_paths in 2:10:max_paths
    push!(experiment_params_list,
          ExperimentParams(dataset=current_dataset,
                           partitioner=QuasiStable,
                           inference_max_paths=current_paths))
end

println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
graph_grouped_box_plot(experiment_params_list;
                       x_type=dataset,
                       y_type=estimate_error,
                       grouping=inference_paths,
                       filename="inferencesampling")
14 changes: 14 additions & 0 deletions Experiments/Scripts/run-summary-sampling-experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using Plots.PlotMeasures
include("../Experiments.jl")


# Experiment script: sweep the summary-construction path budget (`summary_max_paths`)
# over 0:200:max_paths for each dataset, then build, estimate and plot the estimate
# error grouped by that budget.
datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
max_paths = 1000
experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, summary_max_paths=current_paths) for current_dataset in datasets for current_paths in 0:200:max_paths]

println("started building")
build_experiments(experiment_params_list)
println("started estimating")
run_estimation_experiments(experiment_params_list)
println("started graphing")
# `y_type` must be a plotted-value selector; a bare `error` would resolve to the
# `Base.error` function, so the relative-error selector `estimate_error` is used here,
# matching the other experiment scripts.
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=summary_paths, filename="summarysamples")
13 changes: 13 additions & 0 deletions Experiments/Scripts/run-summary-sampling-with-query-type.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using Plots.PlotMeasures
include("../Experiments.jl")

# Experiment script: on a single dataset, sweep the summary-construction path budget
# (`summary_max_paths`) over 10:30:max_paths, then build, estimate and plot the
# estimate error per query type, grouped by that budget.
current_dataset = yeast
max_paths = 300

# One parameter set per path budget, in increasing budget order.
experiment_params_list::Vector{ExperimentParams} = map(10:30:max_paths) do current_paths
    ExperimentParams(dataset=current_dataset,
                     partitioner=QuasiStable,
                     summary_max_paths=current_paths)
end

println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
graph_grouped_box_plot(experiment_params_list;
                       x_type=query_type,
                       y_type=estimate_error,
                       grouping=summary_paths,
                       filename="summarysamplesquerytypesyeast")
21 changes: 21 additions & 0 deletions Experiments/Scripts/update_experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using Plots.PlotMeasures
using Graphs
include("../Experiments.jl")

# Experiment script: for each dataset, maximum cycle size in 2:max_cycles and value of
# `proportion_not_updated`, build a summary, run the estimators and plot the estimate
# error against the proportion, grouped by cycle size.
datasets::Vector{DATASET} = [wordnet]
# Alternative dataset selections:
# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6
proportions_not_updated = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Datasets vary slowest, then cycle sizes, then proportions (fastest).
experiment_params_list::Vector{ExperimentParams} = ExperimentParams[]
for current_dataset in datasets,
    current_cycle in 2:max_cycles,
    current_proportion in proportions_not_updated

    push!(experiment_params_list,
          ExperimentParams(dataset=current_dataset,
                           partitioner=QuasiStable,
                           max_cycle_size=current_cycle,
                           proportion_not_updated=current_proportion))
end

println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
# Alternative plot: how overall accuracy is affected by summary updates.
# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
# Active plot: how cycle-stat accuracy is affected by summary updates.
graph_grouped_box_plot(experiment_params_list;
                       x_type=proportion_not_updated,
                       y_type=estimate_error,
                       grouping=cycle_size,
                       filename="cycle-stats-and-updates")
27 changes: 24 additions & 3 deletions Experiments/build_color_summaries.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,34 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams})
dataset = experiment_params.dataset
summary_params = experiment_params.summary_params
data = load_dataset(dataset)
cloned_data = DataGraph(nv(data.graph))
remaining_edges = []
if (experiment_params.summary_params.proportion_not_updated < 1.0)
cloned_data.vertex_labels = data.vertex_labels
graph_edges = collect(edges(data.graph))
# edges_to_add = (length(graph_edges) * experiment_params.summary_params.proportion_not_updated)
for edge in graph_edges
if (rand() < experiment_params.summary_params.proportion_not_updated)
add_labeled_edge!(cloned_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))]))
# edges_to_add -= 1
else
push!(remaining_edges, edge)
end
end
end
summary_name = params_to_summary_filename(experiment_params)
summary_file_location = "Experiments/SerializedSummaries/" * summary_name
println("Building Color Summary: ", summary_name)
timing_vec = Float64[]
results = @timed generate_color_summary(data, summary_params; verbose=1, timing_vec=timing_vec)
summary_size = Base.summarysize(results.value)
serialize(summary_file_location, results.value)
results = @timed generate_color_summary((experiment_params.summary_params.proportion_not_updated < 1.0) ? cloned_data : data, summary_params; verbose=1, timing_vec=timing_vec)
current_summary = results.value
if (experiment_params.summary_params.proportion_not_updated < 1.0)
for edge in remaining_edges
add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), []))
end
end
summary_size = Base.summarysize(current_summary)
serialize(summary_file_location, current_summary)
push!(build_times, (string(dataset),
string(summary_params.partitioner),
string(summary_params.num_colors),
Expand Down
7 changes: 3 additions & 4 deletions Experiments/graph_results.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase
#todo: query type
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated

@enum VALUE estimate_error runtime build_time memory_footprint

Expand Down Expand Up @@ -92,9 +91,7 @@ function graph_grouped_boxplot_with_comparison_methods(experiment_params_list::V
# load the results
results_filename = params_to_results_filename(experiment_params)
results_path = "Experiments/Results/Estimation_" * results_filename
# println("results path: ", results_path)
results_df = CSV.read(results_path, DataFrame; normalizenames=true)

# get the x_value and grouping (same for all results in this experiment param)

# keep track of the data points
Expand Down Expand Up @@ -246,6 +243,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
return experiment_param.only_shortest_path_cycle
elseif value_type == number_of_colors
return experiment_param.summary_params.num_colors
elseif value_type == proportion_not_updated
return experiment_param.summary_params.proportion_not_updated
else
# default to grouping by technique
return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds)
Expand Down
22 changes: 11 additions & 11 deletions Experiments/run_estimators.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

function run_estimation_experiments(experiment_params_list::Vector{ExperimentParams})
for experiment_params in experiment_params_list
dataset = experiment_params.dataset
Expand All @@ -9,18 +8,19 @@ function run_estimation_experiments(experiment_params_list::Vector{ExperimentPar
experiment_results = []
push!(experiment_results, ("UpperBound", "Estimate", "LowerBound", "TrueCard", "EstimationTime", "QueryType", "QueryPath"))
for i in 1:length(all_queries[dataset])
query = all_queries[dataset][i].query
query::QueryGraph = all_queries[dataset][i].query
query_path = all_queries[dataset][i].query_path
exact_size = all_queries[dataset][i].exact_size
results = @timed get_cardinality_bounds(query, summary;
max_partial_paths = experiment_params.inference_max_paths,
use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)
upper_bound = results.value[3]
estimate = max(1, results.value[2])
lower_bound = results.value[1]
estimate_time = results.time
estimate_results = [(@timed get_cardinality_bounds(query, summary;
max_partial_paths = experiment_params.inference_max_paths,
use_partial_sums=experiment_params.use_partial_sums, usingStoredStats=true,
sampling_strategy=experiment_params.sampling_strategy,
only_shortest_path_cycle= experiment_params.only_shortest_path_cycle)) for _ in 1:3]
estimate_time = median([x.time for x in estimate_results]) # Convert back to seconds from nano seconds
bounds = estimate_results[1].value
upper_bound = bounds[3]
estimate = max(1, bounds[2])
lower_bound = bounds[1]
query_type = all_queries[dataset][i].query_type
push!(experiment_results, (upper_bound, estimate, lower_bound, exact_size, estimate_time, query_type, query_path))
end
Expand Down
5 changes: 3 additions & 2 deletions Experiments/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ struct ExperimentParams
function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6,
only_shortest_path_cycle=false, summary_max_paths=1000,
partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true,
sampling_strategy=redistributive, label_refining_rounds = 0)
sampling_strategy=redistributive, label_refining_rounds = 0, proportion_not_updated=1.0)
return new(dataset, ColorSummaryParams(num_colors=num_colors,
max_cycle_size=max_cycle_size,
max_partial_paths=summary_max_paths,
partitioner=partitioner,
weighting=weighting,
label_refining_rounds=label_refining_rounds),
label_refining_rounds=label_refining_rounds,
proportion_not_updated=proportion_not_updated),
inference_max_paths,
only_shortest_path_cycle,
use_partial_sums,
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ version = "0.1.0"

[deps]
AutoHashEquals = "15f4f7f2-30c1-5605-9d31-71845cf9641f"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Expand All @@ -14,7 +15,6 @@ Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Probably = "2172800d-0309-5a57-a84f-d50c94757422"
ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
QuasiStableColors = "9c3856af-3e7c-4d34-a6af-a406867b22e4"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand Down
8 changes: 5 additions & 3 deletions Source/CardinalityWithColors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,11 @@ struct ColorSummaryParams
partitioner::PARTITIONER
weighting::Bool
label_refining_rounds::Int
proportion_not_updated::Float16

function ColorSummaryParams(;num_colors::Int=64, max_cycle_size=4, max_partial_paths=1000,
partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0)
return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds)
partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0, proportion_not_updated = 1.0)
return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds, proportion_not_updated)
end
end

Expand All @@ -45,7 +46,8 @@ function params_to_string(params::ColorSummaryParams)
summary_name *= string(params.num_colors) * "_"
summary_name *= string(params.max_cycle_size) * "_"
summary_name *= string(params.max_partial_paths)* "_"
summary_name *= string(params.label_refining_rounds)
summary_name *= string(params.label_refining_rounds)* "_"
summary_name *= string(params.proportion_not_updated)
return summary_name
end

Expand Down
Loading

0 comments on commit 7c5ea5a

Please sign in to comment.