Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
Expand Down
6 changes: 3 additions & 3 deletions arrow/groupby-arrow.R → R-arrow/groupby-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ source("./_helpers/helpers.R")
stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
# Point the library search path at the renamed private library so it matches
# the lib.loc used by the library() calls below; the old ./arrow/r-arrow
# directory is no longer created by the setup script.
.libPaths("./R-arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "groupby"
solution = "arrow"
solution = "R-arrow"
fun = "group_by"
cache = TRUE
on_disk = FALSE
Expand Down
6 changes: 3 additions & 3 deletions arrow/join-arrow.R → R-arrow/join-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ source("./_helpers/helpers.R")

# Point the library search path at the renamed private library so it matches
# the lib.loc used by the library() calls below; the old ./arrow/r-arrow
# directory is no longer created by the setup script.
.libPaths("./R-arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "join"
solution = "arrow"
solution = "R-arrow"
cache = TRUE
on_disk = FALSE

Expand Down
6 changes: 6 additions & 0 deletions R-arrow/setup-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

# install stable arrow
mkdir -p ./R-arrow/r-arrow
Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")'
2 changes: 1 addition & 1 deletion arrow/upg-arrow.sh → R-arrow/upg-R-arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ set -e

# upgrade all packages in arrow library only if new arrow is out
echo 'upgrading arrow...'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
4 changes: 4 additions & 0 deletions R-arrow/ver-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
set -e

Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
14 changes: 7 additions & 7 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ solution.dict = {list(
"juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")),
"datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3"))
Expand Down Expand Up @@ -199,7 +199,7 @@ groupby.syntax.dict = {list(
"regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()"
)},
"arrow" = {c(
"R-arrow" = {c(
"sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))",
Expand Down Expand Up @@ -260,7 +260,7 @@ groupby.syntax.dict = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -309,7 +309,7 @@ groupby.data.exceptions = {list(
"polars" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10
)},
"arrow" = {list(
"R-arrow" = {list(
# "timeout" = c(), # q10
"internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr
"G1_1e9_2e0_0_0"), # #190
Expand Down Expand Up @@ -413,7 +413,7 @@ join.syntax.dict = {list(
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"arrow" = {c(
"R-arrow" = {c(
"small inner on int" = "inner_join(DF, small, by='id1')",
"medium inner on int" = "inner_join(DF, medium, by='id2')",
"medium outer on int" = "left_join(DF, medium, by='id2')",
Expand Down Expand Up @@ -454,7 +454,7 @@ join.query.exceptions = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"R-arrow" = list(),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -496,7 +496,7 @@ join.data.exceptions = {list(
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"arrow" = {list(
"R-arrow" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#,
# "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
Expand Down
4 changes: 2 additions & 2 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ clickhouse,groupby
clickhouse,join
polars,groupby
polars,join
arrow,groupby
arrow,join
R-arrow,groupby
R-arrow,join
duckdb,groupby
duckdb,join
duckdb-latest,groupby
Expand Down
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl",
Expand Down
2 changes: 1 addition & 1 deletion _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl"
Expand Down
13 changes: 10 additions & 3 deletions _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
# Returns the character vector of solution names used by the report.
# Both "arrow" and "R-arrow" are listed in the returned vector.
# NOTE(review): presumably the legacy "arrow" name is kept so that timings
# logged before the rename are still recognised -- confirm.
get_report_solutions = function() {
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion")
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow")
}
get_data_levels = function() {
## groupby
Expand Down Expand Up @@ -69,6 +69,9 @@ clean_time = function(d) {
if (nrow(d[!nzchar(version) | is.na(version)]))
stop("timings data contains NA or '' as version field, that should not happen")
old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6")

# replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66)
# Use data.table's `:=` so the relabel is actually written into `d`; a bare
# `==` here only builds a logical comparison and throws it away, leaving the
# old "arrow" labels untouched.
# NOTE(review): assumes `solution` is a character column at this point -- confirm.
d[solution == "arrow", solution := "R-arrow"]
d[!nzchar(git), git := NA_character_
][,"on_disk" := as.logical(on_disk)
][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_
Expand Down Expand Up @@ -243,9 +246,13 @@ transform = function(ld) {
# all ----

time_logs = function(path=getwd()) {
ct = clean_time(load_time(path=getwd()))
# Read timings from the directory the caller passed in, consistent with the
# load_logs()/load_questions() calls in this function; the default of `path`
# is getwd(), so default behaviour is unchanged.
lt <- load_time(path=path)

ct = clean_time(lt)
d = model_time(ct)
l = model_logs(clean_logs(load_logs(path=path)))
ll <- load_logs(path=path)
ll$solution[ll$solution == "arrow"] <- "R-arrow"
l = model_logs(clean_logs(ll))
q = model_questions(clean_questions(load_questions(path=path)))

lq = merge_logs_questions(l, q)
Expand Down
6 changes: 0 additions & 6 deletions arrow/setup-arrow.sh

This file was deleted.

4 changes: 0 additions & 4 deletions arrow/ver-arrow.sh

This file was deleted.

2 changes: 1 addition & 1 deletion run.conf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# task, used in init-setup-iteration.R
export RUN_TASKS="groupby join"
# solution, used in init-setup-iteration.R
export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb duckdb-latest datafusion"
export RUN_SOLUTIONS="collapse data.table juliads juliadf dplyr pandas pydatatable spark dask clickhouse polars R-arrow duckdb duckdb-latest datafusion"

# flag to upgrade tools, used in run.sh on init
export DO_UPGRADE=false
Expand Down
4 changes: 2 additions & 2 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/upg-h2o.
if [[ "$RUN_SOLUTIONS" =~ "h2o" ]]; then ./h2o/ver-h2o.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/upg-polars.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "polars" ]]; then ./polars/ver-polars.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/upg-arrow.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "arrow" ]]; then ./arrow/ver-arrow.sh; fi;
# R-arrow: optionally upgrade, then record the installed version.
# The upgrade script was renamed to upg-R-arrow.sh (not R-upg-arrow.sh).
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/upg-R-arrow.sh; fi;
if [[ "$RUN_SOLUTIONS" =~ "R-arrow" ]]; then ./R-arrow/ver-R-arrow.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/upg-duckdb.sh; fi;
if [[ "$RUN_SOLUTIONS" == "duckdb" ]]; then ./duckdb/ver-duckdb.sh; fi;
if [[ "$DO_UPGRADE" == true && "$RUN_SOLUTIONS" == "duckdb-latest" ]]; then ./duckdb-latest/setup-duckdb-latest.sh; fi;
Expand Down