Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
eb326b8
change update scripts to use slightly different versions of duckdb
Tmonster Sep 28, 2023
881a4f0
Merge branch 'master' into duckdb-default-to-0.8-latest-runs-0.9
Tmonster Sep 28, 2023
10d590a
add clickhouse to regression script
Tmonster Nov 30, 2023
f96d042
actually install clickhouse for regression test
Tmonster Nov 30, 2023
a2a9fa0
remove clickhouse from skipped solutions
Tmonster Nov 30, 2023
56d659b
add tmate solution
Tmonster Nov 30, 2023
6c69d05
fix syntax mistake
Tmonster Nov 30, 2023
36593ce
remove clickhouse skip again
Tmonster Nov 30, 2023
0e48ec1
remove tmate session
Tmonster Nov 30, 2023
5c9af39
also stop clickhouse
Tmonster Nov 30, 2023
8ef6e50
only check if not test run
Tmonster Nov 30, 2023
bb6ab45
add tmate session back. Check for error and exception
Tmonster Dec 1, 2023
0e08e85
run polars and clickhouse. solution verify needs both results to work
Tmonster Dec 1, 2023
ab7eed7
run a benchmark twice
Tmonster Dec 1, 2023
ccd03e6
if clickhouse run an initial group by
Tmonster Dec 1, 2023
bd7281e
matrix.solution
Tmonster Dec 1, 2023
c7422ae
remove duplicate test cases
Tmonster Dec 1, 2023
ec99cb3
fix if statement
Tmonster Dec 1, 2023
0c93386
double quote to single
Tmonster Dec 1, 2023
e071dec
fix datatable install
Tmonster Dec 1, 2023
e1f00eb
fix regex
Tmonster Dec 1, 2023
bd1beb2
add comment explaining why we run clickhouse twice
Tmonster Dec 4, 2023
c84afb2
Rename R to R-arrow (#68)
Tmonster Dec 6, 2023
0f47dcc
Merge branch 'master' into add_clickhouse_to_regression_2
Tmonster Dec 6, 2023
396cd63
Merge remote-tracking branch 'upstream/master'
Tmonster Dec 6, 2023
76bc770
Dask: Refactor and improve groupby-dask (#64)
milesgranger Dec 6, 2023
a55946f
Merge remote-tracking branch 'upstream/master'
Tmonster Dec 6, 2023
966765f
add results for dask
Tmonster Dec 6, 2023
56e4c74
fix bug for reporting arrow as R-arrow
Tmonster Dec 6, 2023
675e2ad
Merge branch 'master' into add_clickhouse_to_regression_2
Tmonster Dec 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion .github/workflows/regression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, duckdb-latest, datafusion, dask, clickhouse]
name: Regression Tests solo solutions
runs-on: ubuntu-20.04
env:
Expand Down Expand Up @@ -54,6 +54,18 @@ jobs:
shell: bash
run: sudo swapoff -a


# needed because clickhouse for some reason produces an error the first
# time a benchmark is run. The next benchmark run will work and overwrite the
# old benchmark files.
- name: Run mini GroupBy benchmark if clickhouse
shell: bash
if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }}
run: |
python3 _utils/prep_solutions.py --task=groupby --solution=clickhouse
source path.env
TEST_RUN=true ./run.sh

- name: Run mini GroupBy benchmark
shell: bash
run: |
Expand Down
6 changes: 3 additions & 3 deletions arrow/groupby-arrow.R → R-arrow/groupby-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ source("./_helpers/helpers.R")
stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
.libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "groupby"
solution = "arrow"
solution = "R-arrow"
fun = "group_by"
cache = TRUE
on_disk = FALSE
Expand Down
6 changes: 3 additions & 3 deletions arrow/join-arrow.R → R-arrow/join-R-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ source("./_helpers/helpers.R")

.libPaths("./arrow/r-arrow") # tidyverse/dplyr#4641 ## leave it like here in case if this affects arrow pkg as well
suppressPackageStartupMessages({
library("arrow", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./arrow/r-arrow", warn.conflicts=FALSE)
library("arrow", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
library("dplyr", lib.loc="./R-arrow/r-arrow", warn.conflicts=FALSE)
})
ver = packageVersion("arrow")
git = ""
task = "join"
solution = "arrow"
solution = "R-arrow"
cache = TRUE
on_disk = FALSE

Expand Down
6 changes: 6 additions & 0 deletions R-arrow/setup-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

# install stable arrow
mkdir -p ./R-arrow/r-arrow
Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")'
2 changes: 1 addition & 1 deletion arrow/upg-arrow.sh → R-arrow/upg-R-arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ set -e

# upgrade all packages in arrow library only if new arrow is out
echo 'upgrading arrow...'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./arrow/r-arrow")) update.packages(lib.loc="./arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
4 changes: 4 additions & 0 deletions R-arrow/ver-R-arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
set -e

Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
14 changes: 7 additions & 7 deletions _benchplot/benchplot-dict.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ solution.dict = {list(
"juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
"clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
"polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
"arrow" = list(name=c(short="arrow", long="Arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
"duckdb" = list(name=c(short="duckdb", long="DuckDB"), color=c(strong="#ddcd07", light="#fff100")),
"duckdb-latest" = list(name=c(short="duckdb-latest", long="duckdb-latest"), color=c(strong="#ddcd07", light="#fff100")),
"datafusion" = list(name=c(short="datafusion", long="Datafusion"), color=c(strong="deepskyblue4", light="deepskyblue3"))
Expand Down Expand Up @@ -199,7 +199,7 @@ groupby.syntax.dict = {list(
"regression v1 v2 by id2 id4" = "DF.groupby(['id2','id4']).agg((pl.pearson_corr('v1','v2')**2).alias('r2')).collect()",
"sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6']).agg([pl.sum('v3').alias('v3'), pl.count('v1').alias('count')]).collect()"
)},
"arrow" = {c(
"R-arrow" = {c(
"sum v1 by id1" = "AT %>% group_by(id1) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 by id1:id2" = "AT %>% group_by(id1, id2) %>% summarise(v1=sum(v1, na.rm=TRUE))",
"sum v1 mean v3 by id3" = "AT %>% group_by(id3) %>% summarise(v1=sum(v1, na.rm=TRUE), v3=mean(v3, na.rm=TRUE))",
Expand Down Expand Up @@ -260,7 +260,7 @@ groupby.syntax.dict = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"R-arrow" = list("Expression row_number() <= 2L not supported in R-arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in R-arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -309,7 +309,7 @@ groupby.data.exceptions = {list(
"polars" = {list(
# "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # q10
)},
"arrow" = {list(
"R-arrow" = {list(
# "timeout" = c(), # q10
"internal error" = c("G1_1e8_2e0_0_0", "G1_1e8_1e2_0_1", "G1_1e8_1e2_5_0", "G1_1e9_1e2_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0","G1_1e9_1e1_0_0", # inherits from dplyr
"G1_1e9_2e0_0_0"), # #190
Expand Down Expand Up @@ -413,7 +413,7 @@ join.syntax.dict = {list(
"medium inner on factor" = "DF.merge(medium, on='id5')",
"big inner on int" = "DF.merge(big, on='id3')"
)},
"arrow" = {c(
"R-arrow" = {c(
"small inner on int" = "inner_join(DF, small, by='id1')",
"medium inner on int" = "inner_join(DF, medium, by='id2')",
"medium outer on int" = "left_join(DF, medium, by='id2')",
Expand Down Expand Up @@ -454,7 +454,7 @@ join.query.exceptions = {list(
"juliads" = list(),
"clickhouse" = list(),
"polars" = list(),
"arrow" = list(),
"R-arrow" = list(),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list()
Expand Down Expand Up @@ -496,7 +496,7 @@ join.data.exceptions = {list(
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"arrow" = {list(
"R-arrow" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#,
# "not yet implemented: #189" = c("J1_1e7_NA_0_0","J1_1e7_NA_5_0","J1_1e7_NA_0_1","J1_1e8_NA_0_0","J1_1e8_NA_5_0","J1_1e8_NA_0_1","J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
Expand Down
4 changes: 2 additions & 2 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ clickhouse,groupby
clickhouse,join
polars,groupby
polars,join
arrow,groupby
arrow,join
R-arrow,groupby
R-arrow,join
duckdb,groupby
duckdb,join
duckdb-latest,groupby
Expand Down
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl",
Expand Down
2 changes: 1 addition & 1 deletion _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
"collapse"=, "data.table"=, "dplyr"=, "h2o"=, "R-arrow"=, "duckdb"="R", "duckdb-latest"="R",
"pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
"clickhouse"="sql",
"juliadf"="jl", "juliads"="jl"
Expand Down
13 changes: 10 additions & 3 deletions _report/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ get_report_status_file = function(path=getwd()) {
file.path(path, "report-done")
}
get_report_solutions = function() {
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars","arrow","duckdb", "duckdb-latest", "datafusion")
c("collapse", "data.table", "dplyr", "pandas", "pydatatable", "spark", "dask", "juliadf", "juliads", "clickhouse", "cudf", "polars", "duckdb", "duckdb-latest", "datafusion", "arrow", "R-arrow")
}
get_data_levels = function() {
## groupby
Expand Down Expand Up @@ -69,6 +69,9 @@ clean_time = function(d) {
if (nrow(d[!nzchar(version) | is.na(version)]))
stop("timings data contains NA or '' as version field, that should not happen")
old_advanced_groupby_questions = c("median v3 sd v3 by id2 id4","max v1 - min v2 by id2 id4","largest two v3 by id2 id4","regression v1 v2 by id2 id4","sum v3 count by id1:id6")

# replace arrow with R-arrow (see https://github.com/duckdblabs/db-benchmark/pull/66)
d$solution[d$solution == "arrow"] <- "R-arrow"
d[!nzchar(git), git := NA_character_
][,"on_disk" := as.logical(on_disk)
][task=="groupby" & solution%in%c("pandas","dask","spark") & batch<1558106628, "out_cols" := NA_integer_
Expand Down Expand Up @@ -243,9 +246,13 @@ transform = function(ld) {
# all ----

time_logs = function(path=getwd()) {
ct = clean_time(load_time(path=getwd()))
lt <- load_time(path=getwd())

ct = clean_time(lt)
d = model_time(ct)
l = model_logs(clean_logs(load_logs(path=path)))
ll <- load_logs(path=path)
ll$solution[ll$solution == "arrow"] <- "R-arrow"
l = model_logs(clean_logs(ll))
q = model_questions(clean_questions(load_questions(path=path)))

lq = merge_logs_questions(l, q)
Expand Down
11 changes: 5 additions & 6 deletions _utils/install_all_solutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ def install_all_solutions():
with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
solutions = csv.DictReader(solutions_file, delimiter=',')
for row in solutions:
if row['solution'] == "clickhouse":
continue
elif row['solution'] == "data.table":
if row['solution'] == "data.table":
install_solutions.add("datatable")
else:
install_solutions.add(row['solution'])
Expand All @@ -44,10 +42,11 @@ def install_all_solutions():
if solution.strip() == "all":
install_all_solutions()
else:
if solution == "clickhouse":
continue
elif solution == "data.table":
if solution == "data.table":
install_solution("datatable")
elif solution == "clickhouse":
install_solution("clickhouse")
install_solution("polars")
else:
install_solution(solution)

4 changes: 3 additions & 1 deletion _utils/prep_solutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SOLUTIONS_FILENAME = "_control/solutions.csv"
RUN_CONF_FILENAME = "run.conf"

SKIPPED_SOLUTIONS = ["clickhouse"]
SKIPPED_SOLUTIONS = []


def print_usage():
Expand Down Expand Up @@ -33,6 +33,8 @@ def main():
solution = parse_solution()
if solution == "all":
solution = get_solutions(task)
if solution == "clickhouse":
solution = "clickhouse polars"
update_run_conf_solutions(solution, task)

def update_run_conf_solutions(solution_name_list, task):
Expand Down
4 changes: 2 additions & 2 deletions _utils/validate_no_errors.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
if [ $(grep -i "error" out/run_*.err | wc -l) = 0 ]
if [ $(grep -iE 'error|exception' out/run_*.err | wc -l) = 0 ]
then
# no true errors found, print last line of each output script
echo "No Errors found in run_*.err logs"
else
echo "The following errors have been found. Failing check"
grep -i "error" out/*.err
grep -iE "error|exception" out/*.err
exit 1
fi

Expand Down
6 changes: 0 additions & 6 deletions arrow/setup-arrow.sh

This file was deleted.

4 changes: 0 additions & 4 deletions arrow/ver-arrow.sh

This file was deleted.

20 changes: 15 additions & 5 deletions clickhouse/exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,39 @@ if [ $1 == 'groupby' ]; then
clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(String), id2 Nullable(String), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)') FORMAT CSV"
# confirm all data loaded yandex/ClickHouse#4463
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
elif [ $1 == 'join' ]; then
# lhs
clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME"
clickhouse-client --query "CREATE TABLE $SRC_DATANAME (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$SRC_DATANAME.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $SRC_DATANAME SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v1 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $SRC_DATANAME'\n$(echo $SRC_DATANAME | cut -d'_' -f2)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS=$(join_to_tbls $SRC_DATANAME)
RHS1=$(echo $RHS | cut -d' ' -f1)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS1"
clickhouse-client --query "CREATE TABLE $RHS1 (id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS1.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS1 SELECT * FROM input('id1 Nullable(Int32), id4 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS1'\n$(echo $RHS1 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS2=$(echo $RHS | cut -d' ' -f2)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS2"
clickhouse-client --query "CREATE TABLE $RHS2 (id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS2.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS2 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS2'\n$(echo $RHS2 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
RHS3=$(echo $RHS | cut -d' ' -f3)
clickhouse-client --query "DROP TABLE IF EXISTS $RHS3"
clickhouse-client --query "CREATE TABLE $RHS3 (id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();"
tail -n+2 data/$RHS3.csv | clickhouse-client --max_memory_usage $CH_MEM --max_insert_threads 1 --query "INSERT INTO $RHS3 SELECT * FROM input('id1 Nullable(Int32), id2 Nullable(Int32), id3 Nullable(Int32), id4 Nullable(String), id5 Nullable(String), id6 Nullable(String), v2 Nullable(Float64)') FORMAT CSV"
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
if [ "$TEST_RUN" != true ]; then
echo -e "clickhouse-client --query 'SELECT count(*) FROM $RHS3'\n$(echo $RHS3 | cut -d'_' -f3)" | Rscript -e 'stdin=readLines(file("stdin")); if ((loaded<-as.numeric(system(stdin[1L], intern=TRUE)))!=as.numeric(stdin[2L])) stop("incomplete data load, expected: ", stdin[2L],", loaded: ", loaded)'
fi
else
echo "clickhouse task $1 not implemented" >&2 && exit 1
fi
Expand Down
2 changes: 1 addition & 1 deletion clickhouse/setup-clickhouse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ sudo rm /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse
sudo service clickhouse-server start

# stop server
#sudo service clickhouse-server stop
sudo service clickhouse-server stop

# let file table function access csv -- NO LONGER NECESSARY
# grep '<user_files_path>/var/lib/clickhouse/user_files/</user_files_path>' /etc/clickhouse-server/config.xml
Expand Down
2 changes: 1 addition & 1 deletion dask/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2023.10.1
2023.10.0
Loading