Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
eb326b8
change update scripts to use slightly different versions of duckdb
Tmonster Sep 28, 2023
881a4f0
Merge branch 'master' into duckdb-default-to-0.8-latest-runs-0.9
Tmonster Sep 28, 2023
10d590a
add clickhouse to regression script
Tmonster Nov 30, 2023
f96d042
actually install clickhouse for regression test
Tmonster Nov 30, 2023
a2a9fa0
remove clickhouse from skipped solutions
Tmonster Nov 30, 2023
56d659b
add tmate solution
Tmonster Nov 30, 2023
6c69d05
fix syntax mistake
Tmonster Nov 30, 2023
36593ce
remove clickhouse skip again
Tmonster Nov 30, 2023
0e48ec1
remove tmate session
Tmonster Nov 30, 2023
5c9af39
also stop clickhouse
Tmonster Nov 30, 2023
8ef6e50
only check if not test run
Tmonster Nov 30, 2023
bb6ab45
add tmate session back. Check for error and exception
Tmonster Dec 1, 2023
0e08e85
run polars and clickhouse. solution verify needs both results to work
Tmonster Dec 1, 2023
ab7eed7
run a benchmark twice
Tmonster Dec 1, 2023
ccd03e6
if clickhouse run an initial group by
Tmonster Dec 1, 2023
bd7281e
matrix.solution
Tmonster Dec 1, 2023
c7422ae
remove duplicate test cases
Tmonster Dec 1, 2023
ec99cb3
fix if statement
Tmonster Dec 1, 2023
0c93386
double quote to single
Tmonster Dec 1, 2023
e071dec
fix datatable install
Tmonster Dec 1, 2023
e1f00eb
fix regex
Tmonster Dec 1, 2023
bd1beb2
add comment explaining why we run clickhouse twice
Tmonster Dec 4, 2023
c84afb2
Rename R to R-arrow (#68)
Tmonster Dec 6, 2023
0f47dcc
Merge branch 'master' into add_clickhouse_to_regression_2
Tmonster Dec 6, 2023
396cd63
Merge remote-tracking branch 'upstream/master'
Tmonster Dec 6, 2023
76bc770
Dask: Refactor and improve groupby-dask (#64)
milesgranger Dec 6, 2023
a55946f
Merge remote-tracking branch 'upstream/master'
Tmonster Dec 6, 2023
966765f
add results for dask
Tmonster Dec 6, 2023
56e4c74
fix bug for reporting arrow as R-arrow
Tmonster Dec 6, 2023
675e2ad
Merge branch 'master' into add_clickhouse_to_regression_2
Tmonster Dec 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion .github/workflows/regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion, dask, clickhouse]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
Expand Down Expand Up @@ -54,6 +54,18 @@ jobs:
shell: bash
run: sudo swapoff -a


# needed because clickhouse for some reason produces an error the first
# time a benchmark is run. The next benchmark run will work and overwrite the
# old benchmark files.
- name: Run mini GroupBy benchmark if clickhouse
shell: bash
if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }}
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse
source path.env
TEST_RUN=true ./run.sh

- name: Run mini GroupBy benchmark
shell: bash
run: |
Expand Down
6 changes: 3 additions & 3 deletions arrow/groupby-arrow.R → R-arrow/groupby-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ source("./_helpers/helpers.R")
stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
.libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "groupby"
solution = "arrow"
solution = "R-arrow"
fun = "group_by"
cache = TRUE
on_disk = FALSE
Expand Down
6 changes: 3 additions & 3 deletions arrow/join-arrow.R → R-arrow/join-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ source("./_helpers/helpers.R")

.libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "join"
solution = "arrow"
solution = "R-arrow"
cache = TRUE
on_disk = FALSE

Expand Down
6 changes: 6 additions & 0 deletions R-arrow/setup-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

# install stable arrow
mkdir -p ./R-arrow/r-arrow
Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")'
2 changes: 1 addition & 1 deletion arrow/upg-arrow.sh → R-arrow/upg-R-arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ set -e

# upgrade all packages in arrow library only if new arrow is out
echo 'upgrading arrow...'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
4 changes: 4 additions & 0 deletions R-arrow/ver-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
set -e

Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
14 changes: 7 additions & 7 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ solution.dict = {list(
"juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")),
"datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3"))
Expand Down Expand Up @@ -199,7 +199,7 @@ groupby.syntax.dict = {list(
"regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()"
)},
"arrow" = {c(
"R-arrow" = {c(
"sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))",
Expand Down Expand Up @@ -260,7 +260,7 @@ groupby.syntax.dict = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -309,7 +309,7 @@ groupby.data.exceptions = {list(
"polars" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10
)},
"arrow" = {list(
"R-arrow" = {list(
# "timeout" = c(), # q10
"internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr
"G1_1e9_2e0_0_0"), # #190
Expand Down Expand Up @@ -413,7 +413,7 @@ join.syntax.dict = {list(
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"arrow" = {c(
"R-arrow" = {c(
"small inner on int" = "inner_join(DF, small, by='id1')",
"medium inner on int" = "inner_join(DF, medium, by='id2')",
"medium outer on int" = "left_join(DF, medium, by='id2')",
Expand Down Expand Up @@ -454,7 +454,7 @@ join.query.exceptions = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"R-arrow" = list(),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -496,7 +496,7 @@ join.data.exceptions = {list(
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"arrow" = {list(
"R-arrow" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#,
# "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
Expand Down
4 changes: 2 additions & 2 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ clickhouse,groupby
clickhouse,join
polars,groupby
polars,join
arrow,groupby
arrow,join
R-arrow,groupby
R-arrow,join
duckdb,groupby
duckdb,join
duckdb-latest,groupby
Expand Down
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl",
Expand Down
2 changes: 1 addition & 1 deletion _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl"
Expand Down
13 changes: 10 additions & 3 deletions _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
get_report_solutions = function() {
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion")
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow")
}
get_data_levels = function() {
## groupby
Expand Down Expand Up @@ -69,6 +69,9 @@ clean_time = function(d) {
if (nrow(d[!nzchar(version) | is.na(version)]))
stop("timings data contains NA or '' as version field, that should not happen")
old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6")

# replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66)
d$solution[d$solution == "arrow"] <- "R-arrow"
d[!nzchar(git), git := NA_character_
][,"on_disk" := as.logical(on_disk)
][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_
Expand Down Expand Up @@ -243,9 +246,13 @@ transform = function(ld) {
# all ----

time_logs = function(path=getwd()) {
ct = clean_time(load_time(path=getwd()))
lt <- load_time(path=getwd())

ct = clean_time(lt)
d = model_time(ct)
l = model_logs(clean_logs(load_logs(path=path)))
ll <- load_logs(path=path)
ll$solution[ll$solution == "arrow"] <- "R-arrow"
l = model_logs(clean_logs(ll))
q = model_questions(clean_questions(load_questions(path=path)))

lq = merge_logs_questions(l, q)
Expand Down
11 changes: 5 additions & 6 deletions _utils/install_all_solutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ def install_all_solutions():
with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
solutions = csv.DictReader(solutions_file, delimiter=',')
for row in solutions:
if row['solution'] == "clickhouse":
continue
elif row['solution'] == "data.table":
if row['solution'] == "data.table":
install_solutions.add("datatable")
else:
install_solutions.add(row['solution'])
Expand All @@ -44,10 +42,11 @@ def install_all_solutions():
if solution.strip() == "all":
install_all_solutions()
else:
if solution == "clickhouse":
continue
elif solution == "data.table":
if solution == "data.table":
install_solution("datatable")
elif solution == "clickhouse":
install_solution("clickhouse")
install_solution("polars")
else:
install_solution(solution)

4 changes: 3 additions & 1 deletion _utils/prep_solutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SOLUTIONS_FILENAME = "_control/solutions.csv"
RUN_CONF_FILENAME = "run.conf"

SKIPPED_SOLUTIONS = ["clickhouse"]
SKIPPED_SOLUTIONS = []


def print_usage():
Expand Down Expand Up @@ -33,6 +33,8 @@ def main():
solution = parse_solution()
if solution == "all":
solution = get_solutions(task)
if solution == "clickhouse":
solution = "clickhouse polars"
update_run_conf_solutions(solution, task)

def update_run_conf_solutions(solution_name_list, task):
Expand Down
4 changes: 2 additions & 2 deletions _utils/validate_no_errors.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
if [ $(grep -i "error" out/run_*.err | wc -l) = 0 ]
if [ $(grep -iE 'error|exception' out/run_*.err | wc -l) = 0 ]
then
# no true errors found, print last line of each output script
echo "No Errors found in run_*.err logs"
else
echo "The following errors have been found. Failing check"
grep -i "error" out/*.err
grep -iE "error|exception" out/*.err
exit 1
fi

Expand Down
6 changes: 0 additions & 6 deletions arrow/setup-arrow.sh

This file was deleted.

4 changes: 0 additions & 4 deletions arrow/ver-arrow.sh

This file was deleted.

20 changes: 15 additions & 5 deletions clickhouse/exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,39 @@ if [ $1 == 'groupby' ]; then
clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)') FORMAT CSV"
# confirm all data loaded yandex/ClickHouse#4463
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
elif [ $1 == 'join' ]; then
# lhs
clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME"
clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS=$(join_to_tbls $SRC_DATANAME)
RHS1=$(echo $RHS | cut -d' ' -f1)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS1"
clickhouse-client --query "CREATE TABLE $RHS1 (id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS1.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS1 SELECT * FROM input('id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS2=$(echo $RHS | cut -d' ' -f2)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS2"
clickhouse-client --query "CREATE TABLE $RHS2 (id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS2.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS2 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS3=$(echo $RHS | cut -d' ' -f3)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS3"
clickhouse-client --query "CREATE TABLE $RHS3 (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS3.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS3 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
else
echo "clickhouse task $1 not implemented" >&2 && exit 1
fi
Expand Down
2 changes: 1 addition & 1 deletion clickhouse/setup-clickhouse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ sudo rm /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse
sudo service clickhouse-server start

# stop server
#sudo service clickhouse-server stop
sudo service clickhouse-server stop

# let file table function access csv -- NO LONGER NECESSARY
# grep '<user_files_path>/var/lib/clickhouse/user_files/</user_files_path>' /etc/clickhouse-server/config.xml
Expand Down
2 changes: 1 addition & 1 deletion dask/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2023.10.1
2023.10.0
Loading