# eval_functions.py
import pandas as pd
import json
import numpy as np
import mizani.formatters as mizani
import plotnine as p9
import tabulate as tab
from argparse import Namespace
import io
import os
import sys
from enum import Enum
import pyco_proc
from pyco_proc import StatsFormat, StatsDestination

def read_latest_result_file(bench, tool, timeout):
    """Returns the content of the latest result file for the given tool and
    timeout found (recursively) in the directory bench, or "" if there is none.
    """
    assert tool != ""
    # substring to filter files with the same timeout
    timeout_str = f"to{timeout}-"
    matching_files = []
    for root, _, files in os.walk(bench):
        for file in files:
            if tool in file and timeout_str in file:
                matching_files.append(os.path.join(root, file))
    if not matching_files:
        print(f"WARNING: {tool} has no .tasks file for {bench}")
        return ""
    # the last 23 characters of the file name are assumed to hold the timestamp,
    # so sorting on them yields the latest file
    latest_file_name = sorted(matching_files, key=lambda x: x[-23:])[-1]
    with open(latest_file_name) as latest_file:
        return latest_file.read()

def load_benches(benches, tools, bench_selection, benchmark_to_group, timeout=120):
    """Loads the latest result files of the given tools for all benchmarks in
    benches and returns one dataframe with the -result, -runtime, and -stats
    columns of every tool.

    bench_selection may restrict the loaded formulae ("INT_CONVS", "QF_S", or
    "QF_SLIA"); any other value keeps everything. benchmark_to_group maps each
    benchmark to its group, stored in the "benchmark-group" column. Runtimes of
    nonsolved instances are set to timeout.
    """
    dfs = dict()
    for bench in benches:
        input = ""
        for tool in tools:
            assert tool != ""
            input += read_latest_result_file(bench, tool, timeout)
        has_error_bench = "django" in benches or "biopython" in benches or "thefuck" in benches
        input = pyco_proc.proc_res(io.StringIO(input), Namespace(csv=True, html=False, text=False, tick=False, stats=StatsDestination.OUTPUT_FILE, stats_format=StatsFormat.JSON, ignore_error=has_error_bench))
        df = pd.read_csv(
            io.StringIO(input),
            sep=";",
            dtype='unicode',
        )
        # in case a tool has only timeouts, its -result column is missing; add it
        for tool in tools:
            if tool + "-result" not in df.keys():
                df[tool + "-result"] = "TO"
        for key in df.keys():
            if key.endswith("-stats"):
                df[key] = df[key] \
                    .apply(lambda value: value.replace("TO", "{}").replace("ERR", "{}")) \
                    .apply(lambda value: value.replace("###", "").replace("\\", "")) \
                    .apply(json.loads)

                # TODO: Bugfix for the incorrectly named value. Remove when
                # "str-num-proc-underapprox-solved-preprocess" is no longer being generated
                # and "str-num-proc-length-solved-preprocess" is correctly generated instead.
                def rename_underapprox_solved_preprocess_to_length_solved_preprocess(stats_dict):
                    if "str-num-proc-underapprox-solved-preprocess" in stats_dict:
                        stats_dict["str-num-proc-length-solved-preprocess"] = stats_dict.pop("str-num-proc-underapprox-solved-preprocess")
                    return stats_dict
                df[key] = df[key].apply(rename_underapprox_solved_preprocess_to_length_solved_preprocess)
        df["benchmark"] = bench
        df["benchmark-group"] = benchmark_to_group[bench]
        dfs[bench] = df

    # tools_no_dates = ['-'.join(tool.split("-")[:-5]) for tool in tools]
    # we keep only the columns of the used tools; -stats columns are taken only
    # for tools whose name contains "stats"
    df_stats = pd.concat(dfs, ignore_index=True)[["benchmark", "benchmark-group", "name"] + [f"{tool}-stats" for tool in tools if "stats" in tool]]
    df_runtime_result = pd.concat(dfs, ignore_index=True)[["benchmark", "benchmark-group", "name"] + [f(tool) for tool in tools for f in (lambda x: x + "-result", lambda x: x + "-runtime")]]
    for tool in tools:
        # set the runtime of nonsolved instances (unknown, TO, ERR, or anything else) to the timeout
        df_runtime_result.loc[(df_runtime_result[f"{tool}-result"] != "sat") & (df_runtime_result[f"{tool}-result"] != "unsat"), f"{tool}-runtime"] = timeout
        # runtime columns should be floats
        df_runtime_result[f"{tool}-runtime"] = df_runtime_result[f"{tool}-runtime"].astype(float)

    if bench_selection == "INT_CONVS":
        # we select only those formulae that contain to_int/from_int
        with open("int_convs_not-full_str_int.txt") as file:
            # fsi_not_conv is a list of formulae from full_str_int that do not contain to_int/from_int
            fsi_not_conv = file.read().splitlines()
        with open("int_convs-str_small_rw.txt") as file:
            # ssr_conv is a list of formulae from str_small_rw that contain to_int/from_int
            ssr_conv = file.read().splitlines()
        with open("int_convs-stringfuzz.txt") as file:
            # sf_conv is a list of formulae from stringfuzz that contain to_int/from_int
            sf_conv = file.read().splitlines()
        df_runtime_result = df_runtime_result[(df_runtime_result.benchmark != "full_str_int") | (~(df_runtime_result.name.isin(fsi_not_conv)))]
        df_runtime_result = df_runtime_result[((df_runtime_result.benchmark != "str_small_rw") & (df_runtime_result.benchmark != "stringfuzz")) | ((df_runtime_result.name.isin(ssr_conv)) | (df_runtime_result.name.isin(sf_conv)))]

    if bench_selection == "QF_S":
        # for woorpje, QF_S benchmarks are those that are not in 20230329-woorpje-lu/track05/
        df_runtime_result = df_runtime_result[(df_runtime_result.benchmark != "woorpje") | (~(df_runtime_result.name.str.contains("/track05/")))]
    if bench_selection == "QF_SLIA":
        # for woorpje, QF_SLIA benchmarks are those that are in 20230329-woorpje-lu/track05/
        df_runtime_result = df_runtime_result[(df_runtime_result.benchmark != "woorpje") | (df_runtime_result.name.str.contains("/track05/"))]

    df_all = df_runtime_result.merge(df_stats)
    return df_all
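
# A minimal usage sketch (the benchmark directories, tool names, and the
# benchmark-to-group mapping below are hypothetical, not fixed by this file):
#
#   df = load_benches(
#       benches=["slog", "woorpje"],
#       tools=["z3", "cvc5"],
#       bench_selection="ALL",  # any value other than the special selections above
#       benchmark_to_group={"slog": "regex", "woorpje": "equations"},
#       timeout=120,
#   )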

def scatter_plot(df, x_tool, y_tool, timeout=120, clamp=True, clamp_domain=[0.01, 120], xname=None, yname=None, log=True, width=6, height=6, show_legend=True, legend_width=2, file_name_to_save=None, transparent=False, color_by_benchmark=True, color_column="benchmark", value_order=None):
    """Returns a scatter plot of the values of the df[x_tool] and df[y_tool] runtime columns.

    Args:
        df (Dataframe): Dataframe containing the values to plot.
        x_tool (str): name of the tool for the x-axis
        y_tool (str): name of the tool for the y-axis
        timeout (int, optional): Timeout value that the upper clamp bound is set to. Defaults to 120.
        clamp (bool, optional): Whether values outside of clamp_domain are clamped to its bounds. Defaults to True.
        clamp_domain (list, optional): The min/max values to plot. Defaults to [0.01, 120].
        xname (str, optional): Name of the x-axis. Defaults to None, in which case x_tool is used.
        yname (str, optional): Name of the y-axis. Defaults to None, in which case y_tool is used.
        log (bool, optional): Use logarithmic scale. Defaults to True.
        width (int, optional): Figure width in inches. Defaults to 6.
        height (int, optional): Figure height in inches. Defaults to 6.
        show_legend (bool, optional): Print legend. Defaults to True.
        legend_width (int, optional): Extra width in inches added to the figure for the legend. Defaults to 2.
        file_name_to_save (str, optional): If not None, save the plot to file_name_to_save.pdf. Defaults to None.
        transparent (bool, optional): Whether the generated plot should have a transparent background. Defaults to False.
        color_by_benchmark (bool, optional): Whether the dots should be colored based on color_column (if not, all dots are orange). Defaults to True.
        color_column (str, optional): Name of the column used for coloring. Defaults to "benchmark".
        value_order (list, optional): Currently unused. Defaults to None.
    """
    assert len(clamp_domain) == 2

    POINT_SIZE = 1.0
    DASH_PATTERN = (0, (6, 2))

    if xname is None:
        xname = x_tool
    if yname is None:
        yname = y_tool

    x_tool = x_tool + "-runtime"
    y_tool = y_tool + "-runtime"

    if show_legend:
        width += legend_width

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        # rebuild the list instead of mutating it, so the (shared) default list
        # is not changed across calls
        clamp_domain = [clamp_domain[0], timeout]
        df = df.copy(deep=True)
        df.loc[df[x_tool] > clamp_domain[1], x_tool] = clamp_domain[1]
        df.loc[df[y_tool] > clamp_domain[1], y_tool] = clamp_domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    if color_by_benchmark:
        scatter += p9.aes(x=x_tool, y=y_tool, color=color_column)
        scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, show_legend=show_legend, raster=True)
        # rug plots
        scatter += p9.geom_rug(na_rm=True, sides="tr", alpha=0.05, raster=True)
    else:
        scatter += p9.aes(x=x_tool, y=y_tool, color=color_column)
        scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, show_legend=show_legend, raster=True, color="orange")
        # rug plots
        scatter += p9.geom_rug(na_rm=True, sides="tr", alpha=0.05, raster=True, color="orange")
    scatter += p9.labs(x=xname, y=yname)
    scatter += p9.theme(legend_key_width=2)
    scatter += p9.scale_color_hue(l=0.4, s=0.9, h=0.1)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=clamp_domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=clamp_domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=clamp_domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=clamp_domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(panel_grid_minor=p9.element_blank())
    scatter += p9.theme(figure_size=(width, height))
    scatter += p9.theme(axis_text=p9.element_text(size=24, color="black"))
    scatter += p9.theme(axis_title=p9.element_text(size=24, color="black"))
    scatter += p9.theme(legend_text=p9.element_text(size=12))
    if transparent:
        scatter += p9.theme(
            plot_background=p9.element_blank(),
            panel_background=p9.element_rect(alpha=0.0),
            panel_border=p9.element_rect(colour="black"),
            legend_background=p9.element_rect(alpha=0.0),
            legend_box_background=p9.element_rect(alpha=0.0),
        )
    if not show_legend:
        scatter += p9.theme(legend_position='none')

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=clamp_domain[1], linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=clamp_domain[1], linetype=DASH_PATTERN)  # horizontal rule

    if file_name_to_save is not None:
        scatter.save(filename=f"{file_name_to_save}.pdf", dpi=500, verbose=False)

    return scatter
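
# A minimal usage sketch (assumes df comes from load_benches and contains
# "z3-runtime"/"cvc5-runtime" columns; the tool names are hypothetical):
#
#   plot = scatter_plot(df, "z3", "cvc5", timeout=120,
#                       xname="Z3", yname="cvc5",
#                       file_name_to_save="fig-scatter-z3-cvc5")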

def cactus_plot(df, tools, timeout=120, tool_names=None, start=0, end=None, logarithmic_y_axis=True, width=6, height=6, show_legend=True, put_legend_outside=False, file_name_to_save=None, num_of_x_ticks=5):
    """Returns a cactus plot (sorted runtimes of each tool in tools). To print the result, use result.figure.savefig("name_of_file.pdf", transparent=True).

    Args:
        df (Dataframe): Dataframe containing, for each tool in tools, the columns tool-result and tool-runtime with the result and runtime for each benchmark.
        tools (list): List of tools to plot.
        timeout (int, optional): Upper bound of the y-axis. Defaults to 120.
        tool_names (dict, optional): Maps each tool to the name used in the legend. If not set (=None), the names are taken directly from tools.
        start (int, optional): The starting position of the x-axis. Defaults to 0.
        end (int, optional): The ending position of the x-axis. If not set (=None), defaults to the number of benchmarks, i.e., len(df).
        logarithmic_y_axis (bool, optional): Use logarithmic scale for the y-axis. Defaults to True.
        width (int, optional): Figure width in inches. Defaults to 6.
        height (int, optional): Figure height in inches. Defaults to 6.
        show_legend (bool, optional): Print legend. Defaults to True.
        put_legend_outside (bool, optional): Whether to put the legend outside the plot. Defaults to False.
        file_name_to_save (str, optional): If not None, save the plot to file_name_to_save.pdf. Defaults to None.
        num_of_x_ticks (int, optional): Number of ticks on the x-axis. Defaults to 5.
    """
    if tool_names is None:
        tool_names = {tool: tool for tool in tools}
    if end is None:
        end = len(df)

    concat = dict()
    for tool in tools:
        name = tool_names[tool]
        concat[name] = pd.Series(sorted(get_solved(df, tool)[tool + "-runtime"].tolist()))
    concat = pd.DataFrame(concat)

    plt = concat.plot.line(figsize=(width, height))
    ticks = np.linspace(start, end, num_of_x_ticks, dtype=int)
    plt.set_xticks(ticks)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.set_xlim([start, end])
    plt.set_ylim([0.1, timeout])
    if logarithmic_y_axis:
        plt.set_yscale('log')
    plt.set_xlabel("Instances", fontsize=16)
    plt.set_ylabel("Runtime [s]", fontsize=16)
    if show_legend:
        if put_legend_outside:
            plt.legend(bbox_to_anchor=(1.04, 1), loc='upper left', framealpha=0.1)
        else:
            plt.legend(loc='upper left', framealpha=0.1)
    # plt.axvline(x=end)
    # plt.get_legend().remove()
    # figlegend = pylab.figure(figsize=(4,4))
    # figlegend.legend(plt.get_children(), concat.columns, loc='center', frameon=False)
    # figlegend.savefig(f"graphs/fig-cactus-{file_name}-legend.pdf", dpi=1000, bbox_inches='tight')
    # plt.figure.savefig(f"graphs/fig-cactus-{file_name}.pdf", dpi=1000, bbox_inches='tight')
    plt.figure.tight_layout()
    if file_name_to_save is not None:
        plt.figure.savefig(f"{file_name_to_save}.pdf", transparent=True)
    return plt
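
# A minimal usage sketch (hypothetical tool names):
#
#   axes = cactus_plot(df, ["z3", "cvc5"], timeout=120,
#                      tool_names={"z3": "Z3", "cvc5": "cvc5"},
#                      file_name_to_save="fig-cactus")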

def sanity_check(df, tool, compare_with):
    """Returns a dataframe containing the rows of df where df[tool-result] contradicts (sat vs. unsat) the result of any of the tools in compare_with.

    Args:
        df (Dataframe): data
        tool (str): the tool whose results are checked
        compare_with (list): List of tools to compare with.
    """
    all_bad = []
    for tool_other in compare_with:
        pt = df
        pt = pt[((pt[tool + "-result"].str.strip() == 'sat') & (pt[tool_other + "-result"].str.strip() == 'unsat')
                 | (pt[tool + "-result"].str.strip() == 'unsat') & (pt[tool_other + "-result"].str.strip() == 'sat'))]
        all_bad.append(pt)
    return pd.concat(all_bad).drop_duplicates()
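
# A minimal usage sketch (hypothetical tool names): list the instances on which
# "z3" contradicts either of the other two tools:
#
#   disagreements = sanity_check(df, "z3", ["cvc5", "ostrich"])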

def get_invalid(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is not a valid result (sat/unsat/unknown/TO/ERR)"""
    return df[((df[tool + "-result"].str.strip() != 'sat') & (df[tool + "-result"].str.strip() != 'unsat') & (df[tool + "-result"].str.strip() != 'unknown') & (df[tool + "-result"].str.strip() != 'TO') & (df[tool + "-result"].str.strip() != 'ERR'))]


def get_solved(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is a result, i.e., either 'sat' or 'unsat'"""
    return df[(df[tool + "-result"].str.strip() == 'sat') | (df[tool + "-result"].str.strip() == 'unsat')]


def get_unknowns(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is 'unknown'"""
    return df[(df[tool + "-result"].str.strip() == 'unknown')]


def get_timeouts(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is a timeout, i.e., 'TO'"""
    return df[(df[tool + "-result"].str.strip() == 'TO')]


def get_errors(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is some error or memout, i.e., 'ERR'"""
    return df[(df[tool + "-result"].str.strip() == 'ERR')]


def get_sat(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is 'sat'"""
    return df[(df[tool + "-result"].str.strip() == 'sat')]


def get_unsat(df, tool):
    """Returns dataframe containing rows of df where df[tool-result] is 'unsat'"""
    return df[(df[tool + "-result"].str.strip() == 'unsat')]
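
# The helpers above slice df by one tool's verdict; e.g. (with a hypothetical
# tool name) the number of solved instances is:
#
#   num_solved = len(get_solved(df, "z3"))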

def simple_table(df, tools, benches, separately=False, times_from_solved=True):
    """Returns (as a string) a simple table with statistics for each tool.

    Args:
        df (Dataframe): data
        tools (list): List of tools.
        benches (list): List of benchmark sets.
        separately (bool, optional): Whether to print a table for each benchmark separately. Defaults to False.
        times_from_solved (bool, optional): Whether to compute the total, average, and median time from solved instances only. Defaults to True.
    """
    result = ""

    def print_table_from_full_df(df):
        header = ["tool", "✅", "❌", "time", "avg", "med", "std", "sat", "unsat", "unknown", "TO", "MO+ERR", "other"]
        result = ""
        result += f"# of formulae: {len(df)}\n"
        result += "--------------------------------------------------\n"
        table = [header]
        for tool in tools:
            sat = len(df[df[tool + "-result"].str.strip() == "sat"])
            unsat = len(df[df[tool + "-result"].str.strip() == "unsat"])
            runtime_col = df[f"{tool}-runtime"]
            if times_from_solved:
                runtime_col = get_solved(df, tool)[f"{tool}-runtime"]
            avg = runtime_col.mean()
            median = runtime_col.median()
            total = runtime_col.sum()
            std = runtime_col.std()
            unknown = len(get_unknowns(df, tool))
            to = len(get_timeouts(df, tool))
            err = len(get_errors(df, tool))
            other = len(get_invalid(df, tool))
            table.append([tool, sat + unsat, unknown + to + err + other, total, avg, median, std, sat, unsat, unknown, to, err, other])
        result += tab.tabulate(table, headers='firstrow', floatfmt=".2f") + "\n"
        result += "--------------------------------------------------\n\n"
        return result

    if separately:
        for bench in benches:
            result += f"Benchmark {bench}\n"
            result += print_table_from_full_df(df[df["benchmark"] == bench])
    else:
        result += print_table_from_full_df(df[df["benchmark"].isin(benches)])
    return result
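
# A minimal usage sketch (hypothetical tool and benchmark names):
#
#   print(simple_table(df, tools=["z3", "cvc5"],
#                      benches=["slog", "woorpje"], separately=True))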

def add_vbs(df, tools_list, name=None):
    """Adds a virtual best solver (vbs) built from the tools in tools_list.

    Args:
        df (Dataframe): data
        tools_list (list): list of tools
        name (str, optional): Name of the vbs used for the new columns. If not set (=None), the name is generated from the names of the tools in tools_list.

    Returns:
        Dataframe: same as df but with new columns for the vbs
    """
    if name is None:
        name = "+".join(tools_list)
    df[f"{name}-runtime"] = df[[f"{tool}-runtime" for tool in tools_list]].min(axis=1)

    def get_result(row):
        nonlocal tools_list
        if "sat" in [str(row[f"{tool}-result"]).strip() for tool in tools_list]:
            return "sat"
        elif "unsat" in [str(row[f"{tool}-result"]).strip() for tool in tools_list]:
            return "unsat"
        else:
            return "unknown"

    # https://stackoverflow.com/questions/26886653/create-new-column-based-on-values-from-other-columns-apply-a-function-of-multi
    df[f"{name}-result"] = df.apply(get_result, axis=1)
    return df
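
# A minimal usage sketch (hypothetical tool names): add a virtual best solver
# over two tools and plot it against one of them:
#
#   df = add_vbs(df, ["z3", "cvc5"], name="vbs")
#   scatter_plot(df, "vbs", "z3")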

def q75(x):
    return np.percentile(a=x, q=75)


def q25(x):
    return np.percentile(a=x, q=25)


def get_stats_total(df: pd.DataFrame, tool: str, benchmarks: list[str]) -> pd.DataFrame:
    return df.loc[df["benchmark"].isin(benchmarks)].drop(["name", "benchmark", f"{tool}-result"], axis=1).agg(['mean', 'std', 'min', 'max', 'sum', 'var', 'median'])

class StatsDataType(Enum):
    INT = 0
    FLOAT = 1
    STRING = 2


STATS_DATA_TYPES = {
    "added-eqs": StatsDataType.INT,
    "arith-bound-propagations-lp": StatsDataType.INT,
    "arith-eq-adapter": StatsDataType.INT,
    "arith-fixed-eqs": StatsDataType.INT,
    "arith-lower": StatsDataType.INT,
    "arith-make-feasible": StatsDataType.INT,
    "arith-max-columns": StatsDataType.INT,
    "arith-max-rows": StatsDataType.INT,
    "arith-upper": StatsDataType.INT,
    "binary-propagations": StatsDataType.INT,
    "decisions": StatsDataType.INT,
    "del-clause": StatsDataType.INT,
    "final-checks": StatsDataType.INT,
    "noodler-final_checks": StatsDataType.INT,
    "max-memory": StatsDataType.FLOAT,
    "memory": StatsDataType.FLOAT,
    "mk-bool-var": StatsDataType.INT,
    "mk-clause": StatsDataType.INT,
    "mk-clause-binary": StatsDataType.INT,
    "num-allocs": StatsDataType.INT,
    "num-checks": StatsDataType.INT,
    "propagations": StatsDataType.INT,
    "rlimit-count": StatsDataType.INT,
    "str-num-proc-stabilization-finish": StatsDataType.INT,
    "str-num-proc-stabilization-start": StatsDataType.INT,
    "time": StatsDataType.FLOAT,
    "total-time": StatsDataType.FLOAT,
    "conflicts": StatsDataType.INT,
    "str-num-solved-preprocess": StatsDataType.INT,
    "str-num-proc-multi-memb-heur-finish": StatsDataType.INT,
    "str-num-proc-multi-memb-heur-start": StatsDataType.INT,
    "arith-diseq": StatsDataType.INT,
    "arith-offset-eqs": StatsDataType.INT,
    "str-num-proc-underapprox-finish": StatsDataType.INT,
    "str-num-proc-underapprox-start": StatsDataType.INT,
    "arith-gcd-calls": StatsDataType.INT,
    "arith-patches": StatsDataType.INT,
    "arith-patches-success": StatsDataType.INT,
    "str-num-proc-length-finish": StatsDataType.INT,
    "str-num-proc-length-start": StatsDataType.INT,
    "str-num-proc-single-memb-heur-finish": StatsDataType.INT,
    "str-num-proc-single-memb-heur-start": StatsDataType.INT,
    "solve-eqs-elim-vars": StatsDataType.INT,
    "solve-eqs-steps": StatsDataType.INT,
    "str-num-proc-nielsen-finish": StatsDataType.INT,
    "str-num-proc-nielsen-start": StatsDataType.INT,
    "str-num-proc-unary-finish": StatsDataType.INT,
    "str-num-proc-unary-start": StatsDataType.INT,
    "str-num-proc-stabilization-solved-preprocess": StatsDataType.INT,
    "str-num-proc-length-solved-preprocess": StatsDataType.INT,
}

def get_stats_dfs(df, tool, order=None):
    """Extracts the statistics of the given tool from the df[tool-stats] column.

    Returns a pair (df_stats, df_stats_zeroed_nans): the second dataframe is the
    first one with NaN statistics replaced by 0.
    """
    global STATS_DATA_TYPES

    TOOL_STATS_STR = f"{tool}-stats"
    TOOL_RUNTIME_STR = f"{tool}-runtime"
    TOOL_RESULT_STR = f"{tool}-result"
    additional_columns_strs = ['name', "benchmark", TOOL_RUNTIME_STR, TOOL_RESULT_STR]

    df_stats = pd.json_normalize(df[TOOL_STATS_STR].copy())
    stats_columns_to_keep_strs = []
    for column in df_stats.keys():
        if (column.endswith("-start")
                or column.endswith("-finish")
                or column.endswith("-preprocess")
                or column in ["noodler-final_checks"]):
            stats_columns_to_keep_strs.append(column)
    df_stats = df_stats[stats_columns_to_keep_strs]
    df_stats = pd.concat([df[additional_columns_strs], df_stats], axis=1)
    # rename the snia benchmark to kaluza
    df_stats.replace({"benchmark": {"snia": "kaluza"}}, inplace=True)

    for column in df_stats:
        if column not in additional_columns_strs:
            if STATS_DATA_TYPES[column] == StatsDataType.INT:
                df_stats[column] = df_stats[column].astype('Int64')
            elif STATS_DATA_TYPES[column] == StatsDataType.FLOAT:
                df_stats[column] = df_stats[column].astype(float)
            elif STATS_DATA_TYPES[column] == StatsDataType.STRING:
                df_stats[column] = df_stats[column].astype(str)

    # for column in df_stats.columns:
    #     if column.endswith("-start") or column.endswith("-finish"):
    #         column_per_final_check_name = f"{column}-per-final-check"
    #         df_stats[column_per_final_check_name] = df_stats[column] / df_stats["final-checks"]
    #         df_stats[column_per_final_check_name] = df_stats[column_per_final_check_name].astype('Int64')

    if order:
        if "unknown" not in order:
            order.append("unknown")
        df_stats["benchmark"] = pd.Categorical(df_stats["benchmark"], order)
        # sort_values is not in-place, so keep its result
        df_stats = df_stats.sort_values("benchmark")

    fill_nan_dict = {}
    for column in df_stats.keys():
        if column in ["name", "benchmark"]:
            continue
        fill_nan_dict[column] = 0
    # fill_nan_dict["name"] = "unknown"
    # fill_nan_dict["benchmark"] = "unknown"
    df_stats_zeroed_nans = df_stats.fillna(value=fill_nan_dict, inplace=False)
    return df_stats, df_stats_zeroed_nans
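
# A minimal usage sketch (the tool name is hypothetical; df must contain a
# "z3-noodler-stats" column as produced by load_benches):
#
#   df_stats, df_stats_zeroed = get_stats_dfs(df, "z3-noodler",
#                                             order=["kaluza", "slog"])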

def get_stats_grouped_by_benchmark(df, tool):
    df_stats_grouped_by_benchmark = df.drop(["name", f"{tool}-result"], axis=1).groupby(["benchmark"], observed=True).agg(['sum'])
    return df_stats_grouped_by_benchmark


def get_stats_grouped_by_benchmark_counts(df, tool):
    return df.drop(["name", f"{tool}-result"], axis=1).groupby(["benchmark"], observed=True).agg(['count'])


def get_stats_characteristics_grouped_by_benchmark_characteristics(df, tool):
    return df.drop(["name", f"{tool}-result"], axis=1).groupby(["benchmark"]).agg(['mean', 'std', 'min', 'max', 'sum', 'var', q25, 'median', q75])

def group_to_benchmark_groups(df, benchmark_to_group, order=None):
    """Returns a copy of df where each value in the "benchmark" column is
    replaced by its group according to benchmark_to_group."""
    df_grouped = df.copy()
    # df_grouped.replace({"benchmark": benchmark_to_group}, inplace=True)
    df_grouped["benchmark"] = df_grouped["benchmark"].map(benchmark_to_group)
    # df_grouped["benchmark"] = df_grouped["benchmark"].astype('category')
    # df_grouped["benchmark"].cat.set_categories(order)
    # df_grouped["benchmark"].sort_values(inplace=True)
    if order:
        if "unknown" not in order:
            order.append("unknown")
        df_grouped["benchmark"] = pd.Categorical(df_grouped["benchmark"], order)
        # sort_values is not in-place, so keep its result
        df_grouped = df_grouped.sort_values("benchmark")
    return df_grouped

def get_stats_paper(df):
    """Returns a dataframe with the procedure statistics aggregated into the
    columns used in the paper."""
    df_paper = pd.DataFrame()
    # df_paper["name"] = df["name"]
    df_paper["benchmark"] = df["benchmark"]
    df_paper["noodler-final_checks"] = df["noodler-final_checks"]
    df_paper["str-num-proc-memb-heur-start"] = (
        df["str-num-proc-multi-memb-heur-start"]
        + df["str-num-proc-single-memb-heur-finish"]
        # + df["str-num-proc-single-memb-heur-start"]
    )
    df_paper["str-num-proc-memb-heur-finish"] = df["str-num-proc-multi-memb-heur-finish"] + df["str-num-proc-single-memb-heur-finish"]
    df_paper["str-num-proc-nielsen-start"] = df["str-num-proc-nielsen-start"]
    df_paper["str-num-proc-nielsen-finish"] = df["str-num-proc-nielsen-finish"]
    df_paper["str-num-proc-length-start"] = df["str-num-proc-length-start"] + df["str-num-proc-length-solved-preprocess"]
    df_paper["str-num-proc-length-finish"] = df["str-num-proc-length-finish"] + df["str-num-proc-length-solved-preprocess"]
    df_paper["str-num-proc-length-solved-preprocess"] = df["str-num-proc-length-solved-preprocess"]
    df_paper["str-num-proc-stabilization-start"] = (
        df["str-num-proc-stabilization-start"]
        + df["str-num-proc-underapprox-finish"]
        + df["str-num-proc-stabilization-solved-preprocess"]
        # + df["str-num-proc-underapprox-start"]
    )
    df_paper["str-num-proc-stabilization-finish"] = (
        df["str-num-proc-stabilization-finish"]
        + df["str-num-proc-underapprox-finish"]
        + df["str-num-proc-stabilization-solved-preprocess"]
    )
    df_paper["str-num-proc-stabilization-solved-preprocess"] = df["str-num-proc-stabilization-solved-preprocess"]
    return df_paper

def get_stats_per_benchmark_paper(df):
    """Sums the paper statistics per benchmark and expresses each counter as a
    percentage of the number of final checks."""
    df_paper = get_stats_paper(df)
    df_stats_per_benchmark_sum = df_paper.groupby("benchmark", observed=True).sum()
    for key in df_stats_per_benchmark_sum.keys():
        if key.endswith("-finish") or key.endswith("-start") or key.endswith("-solved-preprocess"):
            df_stats_per_benchmark_sum[key] = df_stats_per_benchmark_sum[key] / df_stats_per_benchmark_sum["noodler-final_checks"] * 100
            df_stats_per_benchmark_sum[key] = df_stats_per_benchmark_sum[key].round(decimals=2)
    return df_stats_per_benchmark_sum
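
# A minimal usage sketch: starting from the zeroed stats of a (hypothetical)
# tool, compute the per-benchmark percentages used in the paper:
#
#   _, df_stats_zeroed = get_stats_dfs(df, "z3-noodler")
#   print(get_stats_per_benchmark_paper(df_stats_zeroed))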

def write_latex_table_body(df, float_format="{:.2f}", format_index_name=True, index_to_latex=None):
    def format_index_name_default(name):
        if index_to_latex and name in index_to_latex:
            return index_to_latex[name]
        return name

    df_table = df
    if format_index_name:
        df_table = df.rename(index=format_index_name_default)
    # all other to_latex arguments are left at their defaults
    return df_table.to_latex(header=False, index=True, na_rep='NaN', float_format=float_format.format).splitlines()
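
# A minimal usage sketch: render only the body rows of a (hypothetical)
# summary dataframe, mapping index names to LaTeX macros:
#
#   lines = write_latex_table_body(df_summary, float_format="{:,.0f}",
#                                  index_to_latex={"z3": "\\zthree{}"})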

def table_solved_time(df, df_all, benchmarks, benchmark_to_latex, tool_to_latex, per_column="benchmark"):
    """Returns the lines of a LaTeX table body with one "solved"/"time" column
    pair per benchmark; the header also holds the instance counts taken from
    df_all."""
    table_lines = write_latex_table_body(df, format_index_name=True, index_to_latex=tool_to_latex, float_format="{:,.0f}")
    # insert four empty header lines: benchmark names, instance counts, rules, and column labels
    table_lines.insert(2, "")
    table_lines.insert(3, "")
    table_lines.insert(4, "")
    table_lines.insert(5, "")
    i = 2
    for benchmark in benchmarks:
        table_lines[2] += "& " + f"{benchmark_to_latex[benchmark]} & "
        if benchmark == "all":
            count = df_all.count().iloc[0]
        else:
            count = df_all[df_all[per_column] == benchmark].count().iloc[0]
        table_lines[3] += "& " + f"({count:,}) & "
        table_lines[4] += "\\cmidrule[lr]{" + str(i) + "-" + str(i + 1) + "}"
        i += 2
        table_lines[5] += "& solved & time "
    table_lines[2] += "\\\\"
    table_lines[3] += "\\\\"
    table_lines[5] += "\\\\"
    del table_lines[6]
    table_lines = table_lines[1:-1]
    return table_lines

def solved_time_transpose_per_benchmark(df_solved_time):
    """Transposes df_solved_time so that each tool/procedure (taken from the
    "*-result" rows) becomes one row with a -result and a -runtime column for
    every benchmark; the -result counts are formatted with thousands separators.
    """
    df_solved_time_transposed = df_solved_time.transpose()
    concat_rows = []
    for index, _ in df_solved_time_transposed.iterrows():
        if not index.endswith("-result"):
            continue
        index_result_name = index
        index_runtime_name = index_result_name.replace("-result", "-runtime")
        procedure_name = index_result_name.replace("-result", "")
        result_row = df_solved_time_transposed.loc[[index_result_name]]
        runtime_row = df_solved_time_transposed.loc[[index_runtime_name]]
        concat_row = [procedure_name]
        values_result = list(result_row.values)
        values_runtime = list(runtime_row.values)
        # result_row/runtime_row each hold a single row; zip their cells pairwise
        for row_result, row_runtime in zip(values_result, values_runtime):
            for val_result, val_runtime in zip(row_result, row_runtime):
                concat_row += [val_result, val_runtime]
        concat_rows.append(concat_row)

    columns = ["tool"]
    for column in df_solved_time_transposed.keys():
        columns.append(f"{column}-result")
        columns.append(f"{column}-runtime")
    df_concat_rows = pd.DataFrame(concat_rows, columns=columns)
    df_concat_rows.set_index("tool", inplace=True)
    for column in df_concat_rows.columns:
        if column.endswith("-result"):
            df_concat_rows[column] = df_concat_rows[column].apply(lambda x: '{:,}'.format(x))
    return df_concat_rows