Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment cleanup #44

Merged
merged 3 commits into from
Nov 13, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
using Plots.PlotMeasures
include("Experiments/Experiments.jl")
include("../Experiments.jl")

datasets::Vector{DATASET} = [aids]
# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]
@@ -16,10 +14,7 @@ for experiment_params in experiment_params_list
summary_name = params_to_summary_filename(experiment_params)
summary_file_location = "Experiments/SerializedSummaries/" * summary_name
println("Building Color Summary: ", summary_name)
# normal_results = @timed generate_color_summary(data, summary_params; verbose=1)
# normal_results = @timed generate_color_summary(data, summary_params; verbose=0, detailed_cycles=false)
results= @timed generate_color_summary(data, summary_params; verbose=1, detailed_cycles=false)
# println("normal time: ", normal_results.time)
results= @timed generate_color_summary(data, summary_params; verbose=1, use_cycle_join_table=false)
println("detailed time: ", results.time)
summary_size = Base.summarysize(results.value)
serialize(summary_file_location, results.value)
@@ -35,4 +30,4 @@ end
println("started estimating")
run_estimation_experiments(experiment_params_list)
println("started graphing")
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="detailed-sample-experiment")
graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=cycle_size, filename="cycles-without-join-table-cycle-stats-experiment")
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
using Plots.PlotMeasures
include("Experiments/Experiments.jl")
include("../Experiments.jl")

datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6

experiment_params_list::Vector{ExperimentParams} = [ExperimentParams(dataset=current_dataset, partitioner=QuasiStable, max_cycle_size=current_size) for current_dataset in datasets for current_size in 2:max_cycles]

println("started building")
build_experiments(experiment_params_list)
println("started estimating")
13 changes: 13 additions & 0 deletions Experiments/Scripts/run-inference-sampling-experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using Plots.PlotMeasures
include("../Experiments.jl")

# Inference-sampling sweep: for each dataset, vary the `inference_max_paths`
# value given to ExperimentParams, then group the plotted estimate error by it.
datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
max_paths = 60

# One ExperimentParams per (dataset, path budget) pair. Datasets are the outer
# loop; the path budget steps through 2:10:max_paths for each of them.
experiment_params_list::Vector{ExperimentParams} = [
    ExperimentParams(
        dataset=current_dataset,
        partitioner=QuasiStable,
        inference_max_paths=current_paths,
    )
    for current_dataset in datasets for current_paths in 2:10:max_paths
]

# The three stages below all consume the same parameter list, in this order.
println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
graph_grouped_box_plot(
    experiment_params_list;
    x_type=dataset,
    y_type=estimate_error,
    grouping=inference_paths,
    filename="inferencesampling",
)
14 changes: 14 additions & 0 deletions Experiments/Scripts/run-summary-sampling-experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using Plots.PlotMeasures
include("../Experiments.jl")


# Summary-sampling sweep: for each dataset, vary the `summary_max_paths` value
# given to ExperimentParams, then group the plotted estimate error by it.
datasets::Vector{DATASET} = [aids, wordnet, lubm80, human]
max_paths = 1000

# One ExperimentParams per (dataset, path budget) pair. Datasets are the outer
# loop; the path budget steps through 0:200:max_paths for each of them.
experiment_params_list::Vector{ExperimentParams} = [
    ExperimentParams(
        dataset=current_dataset,
        partitioner=QuasiStable,
        summary_max_paths=current_paths,
    )
    for current_dataset in datasets for current_paths in 0:200:max_paths
]

# The three stages below all consume the same parameter list, in this order.
println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
# `y_type` must be a member of the VALUE enum (estimate_error, runtime,
# build_time, memory_footprint). A bare `error` would resolve to Base's `error`
# function instead, so pass `estimate_error` like the other experiment scripts.
graph_grouped_box_plot(
    experiment_params_list;
    x_type=dataset,
    y_type=estimate_error,
    grouping=summary_paths,
    filename="summarysamples",
)
13 changes: 13 additions & 0 deletions Experiments/Scripts/run-summary-sampling-with-query-type.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using Plots.PlotMeasures
include("../Experiments.jl")

# Summary-sampling sweep on a single dataset (yeast): vary `summary_max_paths`
# and plot estimate error per query type, grouped by the path budget.
current_dataset = yeast
max_paths = 300

# One ExperimentParams per path budget in 10:30:max_paths, all on `current_dataset`.
experiment_params_list::Vector{ExperimentParams} = [
    ExperimentParams(
        dataset=current_dataset,
        partitioner=QuasiStable,
        summary_max_paths=current_paths,
    )
    for current_paths in 10:30:max_paths
]

# The three stages below all consume the same parameter list, in this order.
println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
graph_grouped_box_plot(
    experiment_params_list;
    x_type=query_type,
    y_type=estimate_error,
    grouping=summary_paths,
    filename="summarysamplesquerytypesyeast",
)
21 changes: 21 additions & 0 deletions Experiments/Scripts/update_experiments.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using Plots.PlotMeasures
using Graphs
include("../Experiments.jl")

# Summary-update sweep: for each dataset, cross every max cycle size in
# 2:max_cycles with every `proportion_not_updated` value listed below.
datasets::Vector{DATASET} = [wordnet]
# Other dataset selections (swap in for the line above to use them):
# datasets::Vector{DATASET} = [aids, human, yeast, wordnet, youtube, dblp, patents]
# datasets::Vector{DATASET} = [aids, human, lubm80, yeast, hprd, wordnet, dblp, youtube, eu2005, patents]
max_cycles = 6
proportions_not_updated = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

# One ExperimentParams per (dataset, cycle size, proportion) triple. Datasets
# are the outermost loop and proportions the innermost.
experiment_params_list::Vector{ExperimentParams} = [
    ExperimentParams(
        dataset=current_dataset,
        partitioner=QuasiStable,
        max_cycle_size=current_cycle,
        proportion_not_updated=current_proportion,
    )
    for current_dataset in datasets for current_cycle in 2:max_cycles for current_proportion in proportions_not_updated
]

# The three stages below all consume the same parameter list, in this order.
println("started building")
build_experiments(experiment_params_list)

println("started estimating")
run_estimation_experiments(experiment_params_list)

println("started graphing")
# Alternative plot: compare how overall accuracy is affected by summary updates.
# graph_grouped_box_plot(experiment_params_list, x_type=dataset, y_type=estimate_error, grouping=proportion_not_updated, filename="overall-accuracy-and-updates")
# Active plot: compare how cycle stat accuracies are affected by summary updates.
graph_grouped_box_plot(
    experiment_params_list;
    x_type=proportion_not_updated,
    y_type=estimate_error,
    grouping=cycle_size,
    filename="cycle-stats-and-updates",
)
27 changes: 24 additions & 3 deletions Experiments/build_color_summaries.jl
Original file line number Diff line number Diff line change
@@ -4,13 +4,34 @@ function build_experiments(experiment_params_list::Vector{ExperimentParams})
dataset = experiment_params.dataset
summary_params = experiment_params.summary_params
data = load_dataset(dataset)
cloned_data = DataGraph(nv(data.graph))
remaining_edges = []
if (experiment_params.summary_params.proportion_not_updated < 1.0)
cloned_data.vertex_labels = data.vertex_labels
graph_edges = collect(edges(data.graph))
# edges_to_add = (length(graph_edges) * experiment_params.summary_params.proportion_not_updated)
for edge in graph_edges
if (rand() < experiment_params.summary_params.proportion_not_updated)
add_labeled_edge!(cloned_data, (src(edge), dst(edge)), only(data.edge_labels[(src(edge), dst(edge))]))
# edges_to_add -= 1
else
push!(remaining_edges, edge)
end
end
end
summary_name = params_to_summary_filename(experiment_params)
summary_file_location = "Experiments/SerializedSummaries/" * summary_name
println("Building Color Summary: ", summary_name)
timing_vec = Float64[]
results = @timed generate_color_summary(data, summary_params; verbose=1, timing_vec=timing_vec)
summary_size = Base.summarysize(results.value)
serialize(summary_file_location, results.value)
results = @timed generate_color_summary((experiment_params.summary_params.proportion_not_updated < 1.0) ? cloned_data : data, summary_params; verbose=1, timing_vec=timing_vec)
current_summary = results.value
if (experiment_params.summary_params.proportion_not_updated < 1.0)
for edge in remaining_edges
add_summary_edge!(current_summary, src(edge), dst(edge), get(data.edge_labels, (src(edge), dst(edge)), []))
end
end
summary_size = Base.summarysize(current_summary)
serialize(summary_file_location, current_summary)
push!(build_times, (string(dataset),
string(summary_params.partitioner),
string(summary_params.num_colors),
5 changes: 3 additions & 2 deletions Experiments/graph_results.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase
#todo: query type
@enum GROUP dataset technique cycle_size summary_paths inference_paths query_type sampling_type cycle_stats number_of_colors build_phase proportion_not_updated

@enum VALUE estimate_error runtime build_time memory_footprint

@@ -244,6 +243,8 @@ function get_value_from_param(experiment_param::ExperimentParams, value_type::GR
return experiment_param.only_shortest_path_cycle
elseif value_type == number_of_colors
return experiment_param.summary_params.num_colors
elseif value_type == proportion_not_updated
return experiment_param.summary_params.proportion_not_updated
else
# default to grouping by technique
return (experiment_param.summary_params.partitioner, experiment_param.summary_params.label_refining_rounds)
5 changes: 3 additions & 2 deletions Experiments/utils.jl
Original file line number Diff line number Diff line change
@@ -16,13 +16,14 @@ struct ExperimentParams
function ExperimentParams(;dataset::DATASET, num_colors::Int=64, max_cycle_size=6,
only_shortest_path_cycle=false, summary_max_paths=1000,
partitioner::PARTITIONER = QuasiStable, weighting=true, inference_max_paths=500, use_partial_sums=true,
sampling_strategy=redistributive, label_refining_rounds = 0)
sampling_strategy=redistributive, label_refining_rounds = 0, proportion_not_updated=1.0)
return new(dataset, ColorSummaryParams(num_colors=num_colors,
max_cycle_size=max_cycle_size,
max_partial_paths=summary_max_paths,
partitioner=partitioner,
weighting=weighting,
label_refining_rounds=label_refining_rounds),
label_refining_rounds=label_refining_rounds,
proportion_not_updated=proportion_not_updated),
inference_max_paths,
only_shortest_path_cycle,
use_partial_sums,
8 changes: 5 additions & 3 deletions Source/CardinalityWithColors.jl
Original file line number Diff line number Diff line change
@@ -33,10 +33,11 @@ struct ColorSummaryParams
partitioner::PARTITIONER
weighting::Bool
label_refining_rounds::Int
proportion_not_updated::Float16

function ColorSummaryParams(;num_colors::Int=64, max_cycle_size=4, max_partial_paths=1000,
partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0)
return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds)
partitioner::PARTITIONER = QuasiStable, weighting=true, label_refining_rounds = 0, proportion_not_updated = 1.0)
return new(num_colors, max_cycle_size, max_partial_paths, partitioner, weighting, label_refining_rounds, proportion_not_updated)
end
end

@@ -45,7 +46,8 @@ function params_to_string(params::ColorSummaryParams)
summary_name *= string(params.num_colors) * "_"
summary_name *= string(params.max_cycle_size) * "_"
summary_name *= string(params.max_partial_paths)* "_"
summary_name *= string(params.label_refining_rounds)
summary_name *= string(params.label_refining_rounds)* "_"
summary_name *= string(params.proportion_not_updated)
return summary_name
end

37 changes: 0 additions & 37 deletions run-cycle-experiments-timed.jl

This file was deleted.

33 changes: 0 additions & 33 deletions run-inference-sampling-experiments.jl

This file was deleted.

24 changes: 0 additions & 24 deletions run-summary-sampling-experiments-full-range.jl

This file was deleted.

30 changes: 0 additions & 30 deletions run-summary-sampling-experiments.jl

This file was deleted.

24 changes: 0 additions & 24 deletions run-summary-sampling-range-query-type.jl

This file was deleted.

139 changes: 0 additions & 139 deletions update_experiments.jl

This file was deleted.