diff --git a/.gitignore b/.gitignore
index 271f1c27..1bd8080b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,7 @@ data-raw/icd/desc_start_stop.rds
data-raw/icd/icd10/.download_stamp
data-raw/icd/icd10/icd10.rds
data-raw/icd/icd9/.download_stamp
-data-raw/icd/icd9/icd9_cm_pcs.rds
+data-raw/icd/icd9/icd9.rds
data-raw/icd/icd_chapters.rds
data-raw/icd/icd_chapters_subchapters.rds
data-raw/icd/icd_codes.rds
diff --git a/DESCRIPTION b/DESCRIPTION
index fb1f61a5..1face083 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -21,11 +21,11 @@ BugReports: https://github.com/dewittpe/medicalcoder/issues
LazyData: true
Suggests:
data.table,
+ dplyr,
kableExtra,
knitr,
R.utils,
- rmarkdown,
- tibble (>= 2.0.0)
+ rmarkdown
RoxygenNote: 7.3.3
VignetteBuilder: knitr
Roxygen: list(markdown = TRUE)
diff --git a/NEWS.md b/NEWS.md
index a74a3c4f..269f7a2b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,24 @@
+# medicalcoder 0.7.0.9000
+
+## New Features
+
+* If a `tibble` is passed to `comorbidities()` and the `dplyr` namespace is
+ available, then `dplyr` methods will be used for data manipulation. This
+ change will generally result in less computation time than base R
+ `data.frames` (`data.tables` require even less time).
+
+## Other Changes
+
+* Extend and improve the internal ICD-9 database to distinguish between CDC and
+ CMS source.
+
+* Fix documentation of the `mdcr` and `mdcr_longitudinal` datasets.
+
+* Clarified internal data.frame/data.table helpers: documented that `mdcr_select()`
+ deep-copies data.table subsets to avoid aliasing, noted the selfref fix in
+ `mdcr_set()`, and added inline guidance in the longitudinal section of
+ `comorbidities()` to explain the first-occurrence logic.
+
# medicalcoder 0.7.0
## Bug Fixes
diff --git a/R/charlson.R b/R/charlson.R
index 073757c0..1250e2fc 100644
--- a/R/charlson.R
+++ b/R/charlson.R
@@ -62,7 +62,7 @@
cci <- as.integer(as.vector(X %*% cci_wt))
# build the return object
- rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
rtn <- mdcr_set(rtn, j = "num_cmrb", value = num_cmrb)
rtn <- mdcr_set(rtn, j = "cmrb_flag", value = cmrb_flag)
rtn <- mdcr_set(rtn, j = "cci", value = cci)
diff --git a/R/comorbidities.R b/R/comorbidities.R
index c914788a..856c1720 100644
--- a/R/comorbidities.R
+++ b/R/comorbidities.R
@@ -360,13 +360,13 @@ comorbidities.data.frame <- function(data,
# Determine the lookup table and the columns for the lookup table to keep
lookup_to_keep <- c("condition")
if (startsWith(method, "pccc")) {
- lookup <- get_pccc_codes()
+ lookup <- get(x = "pccc_codes", envir = ..mdcr_data_env.., inherits = FALSE)
lookup_to_keep <- c(lookup_to_keep, "subcondition", "transplant_flag", "tech_dep_flag")
} else if (startsWith(method, "charlson")) {
- lookup <- get_charlson_codes()
+ lookup <- get("charlson_codes", envir = ..mdcr_data_env.., inherits = FALSE)
lookup_to_keep <- c(lookup_to_keep)
} else if (startsWith(method, "elixhauser")) {
- lookup <- get_elixhauser_codes()
+ lookup <- get("elixhauser_codes", envir = ..mdcr_data_env.., inherits = FALSE)
lookup_to_keep <- c(lookup_to_keep, "poaexempt")
}
@@ -385,25 +385,21 @@ comorbidities.data.frame <- function(data,
##############################################################################
# inner join the data with the lookup table
on_full <-
- merge(
+ mdcr_inner_join(
x = if (full.codes) {data} else {data[0, ]},
y = lookup,
- all = FALSE,
by.x = by_x,
by.y = c("full_code", by_y),
- suffixes = c("", ".y"),
- sort = FALSE
+ suffixes = c("", ".y")
)
on_comp <-
- merge(
+ mdcr_inner_join(
x = if (compact.codes) {data} else {data[0, ]},
y = lookup,
- all = FALSE,
by.x = by_x,
by.y = c("code", by_y),
- suffixes = c("", ".y"),
- sort = FALSE
+ suffixes = c("", ".y")
)
##############################################################################
@@ -519,6 +515,7 @@ comorbidities.data.frame <- function(data,
grps <- c(grps, "subcondition")
byconditions <- c(byconditions, "subcondition")
}
+ # identify first occurrence per id/condition then retain encounters on/after it
tmp <- mdcr_select(cmrb, c(grps, encid))
tmp <- mdcr_setorder(tmp, c(grps, encid))
keep <- !mdcr_duplicated(tmp, by = grps)
@@ -527,12 +524,11 @@ comorbidities.data.frame <- function(data,
# merge on the poa.var
foc <-
- merge(x = foc,
- y = cmrb,
- all = TRUE,
- by.x = c(id.vars2, "first_occurrance", byconditions),
- by.y = c(id.vars2, encid, byconditions),
- sort = FALSE
+ mdcr_full_outer_join(
+ x = foc,
+ y = cmrb,
+ by.x = c(id.vars2, "first_occurrance", byconditions),
+ by.y = c(id.vars2, encid, byconditions)
)
if (startsWith(method, "pccc")) {
@@ -546,7 +542,7 @@ comorbidities.data.frame <- function(data,
foc <-
lapply(foc,
function(y) {
- rtn <- merge(x = iddf, y = y, all.x = TRUE, by = c(id.vars2), allow.cartesian = TRUE, sort = FALSE)
+ rtn <- mdcr_left_join(x = iddf, y = y, by = c(id.vars2))
rtn <- mdcr_subset(rtn, i = !is.na(rtn[["condition"]]))
i <- rtn[[encid]] >= rtn[["first_occurrance"]]
mdcr_subset(rtn, i = i)
@@ -619,17 +615,6 @@ comorbidities.data.frame <- function(data,
##############################################################################
# set attributes and return
- if (requireNamespace("tibble", quietly = TRUE) && inherits(data, "tbl_df")) {
- if (subconditions) {
- ccc[["conditions"]] <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc[["conditions"]])
- for (i in seq_len(length(ccc[["subconditions"]]))) {
- ccc[["subconditions"]][[i]] <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc[["subconditions"]][[i]])
- }
- } else {
- ccc <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc)
- }
- }
-
attr(ccc, "method") <- method
attr(ccc, "id.vars") <- id.vars
attr(ccc, "flag.method") <- flag.method
diff --git a/R/datasets.R b/R/datasets.R
index 94d6ba08..5b98581a 100644
--- a/R/datasets.R
+++ b/R/datasets.R
@@ -1,9 +1,7 @@
#' Synthetic Data
#'
#' @format
-#' `mdcr` is a `data.frame` with 4 columns, one for a patient id and 41 for
-#' diagnostic codes and 41 possible procedure codes. Each row is for one
-#' patient id.
+#' `mdcr` is a `data.frame` with 4 columns, Each row is for one ICD id.
#'
#' * `patid`: patient identifier, integer values
#' * `icdv`: ICD version; integer values, 9 or 10
@@ -17,15 +15,15 @@
#' Synthetic Longitudinal Data
#'
#' @format
-#' `mdcr_longitudinal` is a `data.frame` with four columns. The codes are
-#' expected to be treated as diagnostic codes but there are a few ICD-9 codes
-#' which could match to procedure codes as well.
+#' `mdcr_longitudinal` is a `data.frame` with 4 columns. The codes are
+#' expected to be treated as diagnostic codes. Warning: there are a few ICD-9
+#' codes which could match to procedure codes.
#'
#' * `patid`: patient identifier, integer values
#' * `date`: date the diagnostic code was recorded
#' * `icdv`: ICD version 9 or 10, integer valued
#' * `code`: ICD codes; character values
-#"
+#'
#' @family datasets
#'
"mdcr_longitudinal"
diff --git a/R/elixhauser.R b/R/elixhauser.R
index 6eec657a..428fed2f 100644
--- a/R/elixhauser.R
+++ b/R/elixhauser.R
@@ -36,7 +36,7 @@
}
# build the return object
- rtn <- cbind(iddf, as.data.frame(results$X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn <- mdcr_cbind(iddf, as.data.frame(results$X, check.names = FALSE, stringsAsFactors = FALSE))
rtn <- mdcr_set(rtn, j = "num_cmrb", value = results$num_cmrb)
rtn <- mdcr_set(rtn, j = "cmrb_flag", value = results$cmrb_flag)
rtn <- mdcr_set(rtn, j = "mortality_index", value = results$mortality_index)
@@ -47,7 +47,6 @@
}
.elixhauser_post2022 <- function(ccc, id.vars, iddf, cmrb, poa.var, primarydx.var, method) {
-
conditions <- ..mdcr_internal_elixhauser_codes..[["condition"]][which(..mdcr_internal_elixhauser_codes..[[method]] == 1L)]
conditions <- sort(unique(conditions))
@@ -207,7 +206,6 @@
}
.elixhauser_pre2022 <- function(ccc, id.vars, iddf, cmrb, poa.var, primarydx.var, method) {
-
# what are the relevent coniditions
conditions <-
unique(..mdcr_internal_elixhauser_codes..[["condition"]][which(..mdcr_internal_elixhauser_codes..[[method]] == 1L)])
diff --git a/R/get_icd_codes.R b/R/get_icd_codes.R
index b28803fa..86ebf0fa 100644
--- a/R/get_icd_codes.R
+++ b/R/get_icd_codes.R
@@ -28,14 +28,16 @@
#'
#' `known_start` is the first fiscal or calendar year (depending on source) that
#' the medicalcoder package as definitive source data for. ICD-9-CM started in
-#' the United States in fiscal year 1980. Source information that could be
-#' downloaded from the CDC and CMS and added to the source code for the
-#' medicalcoder package goes back to 1997. As such 1997 is the "known start"
+#' the United States in fiscal year 1980. The CDC extracts included in
+#' medicalcoder span fiscal years 1997--2012; the CMS ICD-9-CM/PCS extracts
+#' start in fiscal year 2006 and run through fiscal year 2015. As such 1997 is
+#' the earliest "known start" for ICD-9 within medicalcoder.
#'
#' `known_end` is the last fiscal or calendar year (depending on source)
-#' for which we have definitive source data for. For ICD-9-CM and ICD-9-PCS
-#' that is 2015. For ICD-10-CM and ICD-10-PCS, which are active, it is just the
-#' last year of known data. ICD-10 from the WHO ends in 2019.
+#' for which we have definitive source data for. For ICD-9-CM and ICD-9-PCS,
+#' CMS provides data through fiscal year 2015, while the CDC extracts stop at
+#' fiscal year 2012. For ICD-10-CM and ICD-10-PCS, which are active, it is just
+#' the last year of known data. ICD-10 from the WHO ends in 2019.
#'
#' ## Header and Assignable Codes
#'
diff --git a/R/is_icd.R b/R/is_icd.R
index a0aefe2c..57535021 100644
--- a/R/is_icd.R
+++ b/R/is_icd.R
@@ -67,7 +67,7 @@ is_icd <- function(x, icdv = c(9L, 10L), dx = c(1L, 0L),
}
# get the known icd codes and filter to relevent codes
- codes <- get_icd_codes(with.descriptions = FALSE, with.hierarchy = FALSE)
+ codes <- get("icd_codes", envir = ..mdcr_data_env.., inherits = FALSE)
# keep based on icdv, dx, and src
keep <- (codes[["icdv"]] %in% icdv) & (codes[["dx"]] %in% dx) & (codes[["src"]] %in% src)
diff --git a/R/lookup_icd_codes.R b/R/lookup_icd_codes.R
index 4e9cda19..d70038bf 100644
--- a/R/lookup_icd_codes.R
+++ b/R/lookup_icd_codes.R
@@ -43,7 +43,7 @@ lookup_icd_codes <- function(x, regex = FALSE, full.codes = TRUE, compact.codes
assert_scalar_logical(compact.codes)
stopifnot(isTRUE(full.codes | compact.codes))
- ICDCODES <- get_icd_codes(with.descriptions = FALSE, with.hierarchy = FALSE)
+ ICDCODES <- get("icd_codes", envir = ..mdcr_data_env.., inherits = FALSE)
if (regex) {
if(full.codes) {
diff --git a/R/pccc.R b/R/pccc.R
index 74ef0d6d..dedf179b 100644
--- a/R/pccc.R
+++ b/R/pccc.R
@@ -139,7 +139,7 @@
X <- X[, colorder, drop = FALSE]
- rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
if (subconditions) {
rtn <- list(conditions = rtn, subconditions = list())
@@ -166,7 +166,7 @@
X[cbind(ri[keep], ci[keep])] <- 1L
}
- rtn[["subconditions"]][[cnd]] <- cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn[["subconditions"]][[cnd]] <- mdcr_cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
}
}
@@ -209,7 +209,7 @@
X[match(key_tran, key_iddf), "any_transplant"] <- 1L
X <- cbind(X, num_cmrb, cmrb_flag)
- rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
if (subconditions) {
rtn <- list(conditions = rtn, subconditions = list())
@@ -235,7 +235,7 @@
X[cbind(ri[keep], ci[keep])] <- 1L
}
- rtn[["subconditions"]][[cnd]] <- cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+ rtn[["subconditions"]][[cnd]] <- mdcr_cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
}
}
diff --git a/R/summary.R b/R/summary.R
index efcf7240..e6297501 100644
--- a/R/summary.R
+++ b/R/summary.R
@@ -104,7 +104,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
N <- nrow(object[["conditions"]])
- conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+ conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
conditions <- unique(conditions)
conditions <- conditions[order(conditions[["condition"]]), ]
@@ -159,7 +159,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
warning(sprintf("Logic for pccc_summary_table has been implemented for flag.method = 'current'. Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
}
- conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+ conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
conditions <- unique(conditions)
conditions <- conditions[order(conditions[["condition"]]), ]
@@ -189,7 +189,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
warning(sprintf("Logic for pccc_summary_table has been implemented for flag.method = 'current'. Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
}
- conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+ conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
conditions <- unique(conditions)
conditions <- conditions[order(conditions[["condition"]]), ]
@@ -254,7 +254,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
warning(sprintf("Logic for charlson summary table has been implemented for flag.method = 'current'. Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
}
- cmrbs <- get_charlson_index_scores()[!is.na( get_charlson_index_scores()[[attr(object, "method")]]), c("condition_description", "condition")]
+ cmrbs <- ..mdcr_internal_charlson_index_scores..[!is.na( ..mdcr_internal_charlson_index_scores..[[attr(object, "method")]]), c("condition_description", "condition")]
cmrbs[["count"]] <- colSums(object[cmrbs[["condition"]]])
cmrbs[["percent"]] <- 100 * colMeans(object[cmrbs[["condition"]]])
@@ -305,7 +305,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
warning(sprintf("Logic for Elixhauser summary has been implemented for flag.method = 'current'. Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
}
- cmrbs <- get_elixhauser_index_scores()[!is.na( get_elixhauser_index_scores()[[attr(object, "method")]]), "condition", drop = FALSE]
+ cmrbs <- ..mdcr_internal_elixhauser_index_scores..[!is.na( ..mdcr_internal_elixhauser_index_scores..[[attr(object, "method")]]), "condition", drop = FALSE]
cmrbs <- unique(cmrbs)
cmrbs[["count"]] <- colSums(object[cmrbs[["condition"]]])
diff --git a/R/utilities.R b/R/utilities.R
index c92fa74e..c39b193c 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -6,6 +6,10 @@
#' subsetting needs to be specific to data.tables. These internal (non-exported)
#' functions provide the needed method by data.table or data.frame.
#'
+#' `mdcr_select()` deep-copies data.table subsets (via data.table::copy()) to
+#' avoid aliasing when downstream code mutates; this intentionally trades some
+#' performance for isolation.
+#'
#' @param x a data.frame or data.table
#' @param i Optional. Indicates the rows on which the values must be updated. If
#' not `NULL`, implies all rows.
@@ -25,13 +29,44 @@ NULL
#' @keywords internal
mdcr_set <- function(x, i = NULL, j, value) {
stopifnot(is.data.frame(x))
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ # calling data.table::setDT to make sure that the object can be modified by
+ # reference. Without data.table::setDT here we see an error if the table was
+ # read from disk (e.g., readRDS()) or hand-constructed:
+ #
+ # This data.table has either been loaded from disk (e.g. using
+ # readRDS()/load()) or constructed manually (e.g. using structure()).
+ # Please run setDT() or setalloccol() on it first (to pre-allocate space
+ # for new columns) before assigning by reference to it.
+ #
+ # data.tables read via readRDS()/load() or hand‑constructed can have an
+ # invalid .internal.selfref, so the first by‑reference op (set()/:= ) errors.
+ # data.table::setDT() reinitializes the selfref so data.table::set() can work
+ # by reference. This wrapper keeps by-ref semantics consistent across backends.
+
getExportedValue(name = "setDT", ns = "data.table")(x = x)
getExportedValue(name = "set", ns = "data.table")(x = x, i = i, j = j, value = value)
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ mutate <- getExportedValue(name = "mutate", ns = "dplyr")
+ if (is.null(i)) {
+ newcol <- if (nrow(x) == 0L) value[0] else value
+ } else {
+ newcol <- x[[j]]
+ if (is.null(newcol)) {
+ newcol <- rep(NA, length = nrow(x))
+ storage.mode(newcol) <- typeof(value)
+ }
+ newcol[i] <- value
+ }
+ x <- do.call(mutate, c(list(.data = x), stats::setNames(list(newcol), j)))
} else {
if (is.null(i)) {
x[[j]] <- value
} else {
+ if (is.null(x[[j]])) {
+ x[[j]] <- rep(NA, nrow(x))
+ storage.mode(x[[j]]) <- typeof(value)
+ }
x[[j]][i] <- value
}
}
@@ -49,8 +84,25 @@ mdcr_select <- function(x, cols) {
return(x)
}
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ # # By makeing sure that cols is a character vector the `with = FALSE` is not
+ # # needed for data.tables which will allow for a simple call. This is
+ # # important because `[.data.frame` will error if `with = FALSE` is passed.
+ # # `[.data.table` does not need `with = FALSE` if `j` is a character vector.
+ # stopifnot(inherits(cols, "character"))
+ # #x[, cols, drop = FALSE, with = FALSE]
+ # x[, cols, drop = FALSE]
+
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ # note: the data.table::copy() is needed here because x[, cols] returns a
+ # shallow copy of the columns. The use of mdcr_select in the package
+ # implicitly assumes deep copies. Downstream setorder()/setnames() mutate in
+ # place, so copying here preserves the original. This pays a copy cost to
+ # protect callers who expect an isolated subset.
return(getExportedValue(name = "copy", ns = "data.table")(x[, cols, drop = FALSE, with = FALSE]))
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ select <- getExportedValue(name = "select", ns = "dplyr")
+ all_of <- getExportedValue(name = "all_of", ns = "dplyr")
+ return(select(x, all_of(cols)))
} else {
return(x[, cols, drop = FALSE])
}
@@ -71,17 +123,27 @@ mdcr_subset <- function(x, i, cols) {
return(mdcr_select(x, cols = cols))
}
} else {
+ # match base/data.table semantics: logical i is converted to positions
rows <- if (is.logical(i)) which(i) else i
if (missing(cols)) {
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
return(x[rows, , drop = FALSE, with = FALSE])
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ slice <- getExportedValue(name = "slice", ns = "dplyr")
+ return(slice(x, rows))
} else {
return(x[rows, , drop = FALSE])
}
} else {
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
return(x[rows, cols, drop = FALSE, with = FALSE])
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ slice <- getExportedValue(name = "slice", ns = "dplyr")
+ select <- getExportedValue(name = "select", ns = "dplyr")
+ all_of <- getExportedValue(name = "all_of", ns = "dplyr")
+ x <- slice(x, rows)
+ return(select(x, all_of(cols)))
} else {
cols_idx <- match(cols, names(x))
return(x[rows, cols_idx, drop = FALSE])
@@ -97,8 +159,11 @@ mdcr_subset <- function(x, i, cols) {
#' @keywords internal
mdcr_setorder <- function(x, by) {
stopifnot(is.data.frame(x))
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
getExportedValue(name = "setorderv", ns = "data.table")(x, by)
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ arrange <- getExportedValue(name = "arrange", ns = "dplyr")
+ x <- do.call(arrange, c(list(.data = x), lapply(by, as.name)))
} else {
x <- x[do.call(order, x[by]), , drop = FALSE]
}
@@ -112,11 +177,15 @@ mdcr_setorder <- function(x, by) {
#' @keywords internal
mdcr_setnames <- function(x, old, new, ...) {
stopifnot(is.data.frame(x))
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ stopifnot(is.character(old), is.character(new))
+ stopifnot(length(old) == length(new))
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
getExportedValue(name = "setnames", ns = "data.table")(x, old, new, ...)
+ } else if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ rename <- getExportedValue(name = "rename", ns = "dplyr")
+ args <- c(list(.data = x), stats::setNames(lapply(old, as.name), new))
+ x <- do.call(rename, args)
} else {
- stopifnot(is.character(old), is.character(new))
- stopifnot(length(old) == length(new))
for (i in seq_len(length(old))) {
names(x)[names(x) == old[i]] <- new[i]
}
@@ -131,17 +200,150 @@ mdcr_setnames <- function(x, old, new, ...) {
#' @keywords internal
mdcr_duplicated <- function(x, by = seq_along(x), ...) {
stopifnot(is.data.frame(x))
- if (requireNamespace("data.table", quietly = TRUE) && inherits(x, "data.table")) {
+ if (requireNamespace(package = "data.table", quietly = TRUE) && inherits(x, "data.table")) {
# Flag this frame as data.table-aware so duplicated.data.table uses its
# optimized path instead of falling back to duplicated.data.frame.
.datatable.aware <- TRUE
rtn <- utils::getFromNamespace(x = 'duplicated.data.table', ns = "data.table")(x, by = by, ...)
- } else {
+ } else { # this is for base R data.frames and tidyverse tbl_df
rtn <- duplicated(x[, by, drop = FALSE], ...)
}
rtn
}
+#'
+#' @rdname mdcr_data_frame_tools
+#' @family data.frame tools
+#' @noRd
+#' @keywords internal
+mdcr_left_join <- function(x, y, ...) {
+ stopifnot(is.data.frame(x), is.data.frame(y))
+
+ if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ lj <- getExportedValue(name = "left_join", ns = "dplyr")
+ dots <- list(...)
+ if (!is.null(dots$by.x) & !is.null(dots$by.y)) {
+ by <- stats::setNames(dots$by.y, dots$by.x)
+ dots$by.x <- NULL
+ dots$by.y <- NULL
+ } else if (!is.null(dots$by)) {
+ by <- dots$by
+ dots$by <- NULL
+ } else {
+ by <- NULL
+ }
+ if (!is.null(dots$suffixes)) {
+ suffix <- dots$suffixes
+ dots$suffixes <- NULL
+ } else {
+ suffix <- c(".x", ".y")
+ }
+ # normalize to dplyr's by/suffix arguments to mirror base/data.table defaults
+ rtn <- do.call(what = lj, args = c(list(x = x, y = y, by = by, suffix = suffix), dots))
+ } else {
+ # if x is a data.table and the data.table namespace is available then the
+ # data.table:::merge.data.table method will be called and a specific block
+ # for data.table is not needed here
+ rtn <- merge(x = x, y = y, all.x = TRUE, all.y = FALSE, sort = FALSE, allow.cartesian = TRUE, ...)
+ }
+ rtn
+}
+
+#'
+#' @rdname mdcr_data_frame_tools
+#' @family data.frame tools
+#' @noRd
+#' @keywords internal
+mdcr_full_outer_join <- function(x, y, ...) {
+ stopifnot(is.data.frame(x), is.data.frame(y))
+
+ if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ fj <- getExportedValue(name = "full_join", ns = "dplyr")
+ dots <- list(...)
+ if (!is.null(dots$by.x) & !is.null(dots$by.y)) {
+ by <- stats::setNames(dots$by.y, dots$by.x)
+ dots$by.x <- NULL
+ dots$by.y <- NULL
+ } else if (!is.null(dots$by)) {
+ by <- dots$by
+ dots$by <- NULL
+ } else {
+ by <- NULL
+ }
+ if (!is.null(dots$suffixes)) {
+ suffix <- dots$suffixes
+ dots$suffixes <- NULL
+ } else {
+ suffix <- c(".x", ".y")
+ }
+ # normalize to dplyr's by/suffix arguments to mirror base/data.table defaults
+ rtn <- do.call(what = fj, args = c(list(x = x, y = y, by = by, suffix = suffix), dots))
+ } else {
+ # if x is a data.table and the data.table namespace is available then the
+ # data.table:::merge.data.table method will be called and a specific block
+ # for data.table is not needed here
+ rtn <- merge(x = x, y = y, all.x = TRUE, all.y = TRUE, sort = FALSE, allow.cartesian = TRUE, ...)
+ }
+ rtn
+}
+
+#'
+#' @rdname mdcr_data_frame_tools
+#' @family data.frame tools
+#' @noRd
+#' @keywords internal
+mdcr_inner_join <- function(x, y, ...) {
+ stopifnot(is.data.frame(x), is.data.frame(y))
+
+ if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ ij <- getExportedValue(name = "inner_join", ns = "dplyr")
+ dots <- list(...)
+ if (!is.null(dots$by.x) & !is.null(dots$by.y)) {
+ by <- stats::setNames(dots$by.y, dots$by.x)
+ dots$by.x <- NULL
+ dots$by.y <- NULL
+ } else if (!is.null(dots$by)) {
+ by <- dots$by
+ dots$by <- NULL
+ } else {
+ by <- NULL
+ }
+ if (!is.null(dots$suffixes)) {
+ suffix <- dots$suffixes
+ dots$suffixes <- NULL
+ } else {
+ suffix <- c(".x", ".y")
+ }
+ # normalize to dplyr's by/suffix arguments to mirror base/data.table defaults
+ rtn <- do.call(what = ij, args = c(list(x = x, y = y, by = by, suffix = suffix), dots))
+ } else {
+ # if x is a data.table and the data.table namespace is available then the
+ # data.table:::merge.data.table method will be called and a specific block
+ # for data.table is not needed here
+ rtn <- merge(x = x, y = y, all.x = FALSE, all.y = FALSE, sort = FALSE, ...)
+ }
+ rtn
+}
+
+#'
+#' @rdname mdcr_data_frame_tools
+#' @family data.frame tools
+#' @noRd
+#' @keywords internal
+mdcr_cbind <- function(x, ...) {
+ stopifnot(is.data.frame(x))
+ if (requireNamespace(package = "dplyr", quietly = TRUE) && inherits(x, "tbl_df")) {
+ cb <- getExportedValue(name = "bind_cols", ns = "dplyr")
+ rtn <- cb(x, ...)
+ } else {
+ # if x is a data.table and the data.table namespace is available then the
+ # data.table:::cbind.data.table method will be called and a specific block
+ # for data.table is not needed here
+ rtn <- cbind(x, ...)
+ }
+ rtn
+}
+
################################################################################
diff --git a/README.Rmd b/README.Rmd
index 957a606a..024c6238 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -56,11 +56,11 @@ The primary objectives of medicalcoder are:
ease of maintenance and usability.
- Suggested packages are needed only for development work and building
vignettes. They are not required for installation or use.
- - That said, there are non-trivial performance gains when passing a
- [`data.table`](https://cran.r-project.org/package=data.table) to the
- `comorbidities()` function compared to passing a base `data.frame` or a
- `tibble` from the tidyverse.
- (See [benchmarking](https://github.com/dewittpe/medicalcoder/tree/main/benchmarking)).
+ - That said, there are non-trivial performance gains when passing a
+ [`data.table`](https://cran.r-project.org/package=data.table) to the
+ `comorbidities()` function. Passing a `tibble` is typically faster than a
+ base `data.frame` but slower than a `data.table`.
+ (See [benchmarking](https://github.com/dewittpe/medicalcoder/tree/main/benchmarking)).
- Internal lookup tables
- All required data are included in the package. If you have the `.tar.gz`
@@ -388,10 +388,12 @@ comorbidity algorithm to a data set are:
1. Data size: number of subjects/encounters.
2. Data storage class: medicalcoder has been built such that no imports of
- other namespaces is required. That said, when a `data.table` is passed to
+ other namespaces is required. That said, when a `data.table` is passed to
`comorbidities()` and the `data.table` namespace is available, then S3
dispatch for `merge` is used, along with some other methods, to reduce memory
- use and reduce computation time.
+ use and reduce computation time. When a `tibble` is passed and the tidyverse
+ namespaces are available, the tibble-aware paths improve performance over a
+ base `data.frame`, but `data.table` remains fastest.
3. `flag.method`: "current" will take less time than the "cumulative" method.
Details on the benchmarking method, summary graphics, and tables, can be found
diff --git a/README.md b/README.md
index 6af06abc..1abc3280 100644
--- a/README.md
+++ b/README.md
@@ -48,11 +48,11 @@ The primary objectives of medicalcoder are:
ease of maintenance and usability.
- Suggested packages are needed only for development work and building
vignettes. They are not required for installation or use.
- - That said, there are non-trivial performance gains when passing a
- [`data.table`](https://cran.r-project.org/package=data.table) to the
- `comorbidities()` function compared to passing a base `data.frame` or a
- `tibble` from the tidyverse.
- (See [benchmarking](https://github.com/dewittpe/medicalcoder/tree/main/benchmarking)).
+ - That said, there are non-trivial performance gains when passing a
+ [`data.table`](https://cran.r-project.org/package=data.table) to the
+ `comorbidities()` function. Passing a `tibble` is typically faster than a
+ base `data.frame` but slower than a `data.table`.
+ (See [benchmarking](https://github.com/dewittpe/medicalcoder/tree/main/benchmarking)).
- Internal lookup tables
- All required data are included in the package. If you have the `.tar.gz`
@@ -459,10 +459,12 @@ comorbidity algorithm to a data set are:
1. Data size: number of subjects/encounters.
2. Data storage class: medicalcoder has been built such that no imports of
- other namespaces is required. That said, when a `data.table` is passed to
+ other namespaces is required. That said, when a `data.table` is passed to
`comorbidities()` and the `data.table` namespace is available, then S3
dispatch for `merge` is used, along with some other methods, to reduce memory
- use and reduce computation time.
+ use and reduce computation time. When a `tibble` is passed and the tidyverse
+ namespaces are available, the tibble-aware paths improve performance over a
+ base `data.frame`, but `data.table` remains fastest.
3. `flag.method`: "current" will take less time than the "cumulative" method.
Details on the benchmarking method, summary graphics, and tables, can be found
diff --git a/benchmarking/Makefile b/benchmarking/Makefile
index 58db8f16..df5e910e 100644
--- a/benchmarking/Makefile
+++ b/benchmarking/Makefile
@@ -1,19 +1,19 @@
include ../Makevars
-.NOPARALLEL: .bench1 .bench2
+.NOPARALLEL: .bench
.PHONY: all clean
-all: benchmark2.svg README.md
+all: README.md
-grid%.tsv: make_grid%.sh
+grid.tsv: make_grid.sh
./$<
-.bench%: run_parallel%.sh grid%.tsv
+.bench: run_parallel.sh grid.tsv
./$<
@touch $@
-benchmark%.svg: benchmark%-summary.R .bench%
+outtable.rds: benchmark-summary.R .bench
$(RSCRIPT) $<
README.md: README.Rmd outtable.rds
@@ -21,11 +21,10 @@ README.md: README.Rmd outtable.rds
clean:
$(RM) .bench*
- $(RM) benchmark*.svg
+ $(RM) *.svg
$(RM) *.pdf
- $(RM) grid1.tsv grid2.tsv
- $(RM) -r logs1
- $(RM) -r logs2
- $(RM) -r bench1_results
- $(RM) -r bench2_results
+ $(RM) grid.tsv
+ $(RM) output.rds
+ $(RM) -r logs
+ $(RM) -r bench_results
diff --git a/benchmarking/README.Rmd b/benchmarking/README.Rmd
index b81315bd..c17a6341 100644
--- a/benchmarking/README.Rmd
+++ b/benchmarking/README.Rmd
@@ -17,10 +17,12 @@ comorbidity algorithm to a data set are:
1. Data size: number of subjects/encounters.
2. Data storage class: `medicalcoder` has been built such that no imports of
- other namespaces is required. That said, when a `data.table` is passed to
+ other namespaces is required. That said, when a `data.table` is passed to
`comorbidities()` and the `data.table` namespace is available, then S3
dispatch for `merge` is used, along with some other methods, to reduce memory
- use and reduce computation time.
+ use and reduce computation time. When a `tibble` is passed and tidyverse
+ namespaces are available, the tibble-aware path reduces time relative to a
+ base `data.frame`, though `data.table` remains the fastest option.
3. flag.method: "current" will take less time than the "cumulative" method.
@@ -61,12 +63,11 @@ outtable[, method := fifelse(subconditions, paste(method, "(with subconditions)"
outtable[, subconditions := NULL]
```
-In general, the expected time to apply a comorbidity method is the same between
-`data.frame`s and `tibble`s. There is a notable decrease in time required when
-`data.table`s are passed to `comorbidities()`. Best observed case: a
-`data.table` took
+In general, the expected time to apply a comorbidity method is lower for
+`tibble`s than for base `data.frame`s, and lower still for `data.table`s. Best
+observed case: a `data.table` took
`r outtable_orig[data_class == "data.table", min(relative_time)]`
-the time of a `data.frame`.
+the time of a `data.frame`; `tibble`s sit between the two.
```{r show-outtable, echo = FALSE, results = "asis"}
foo <- function(x) {
diff --git a/benchmarking/README.md b/benchmarking/README.md
index d483ed1d..b708c1f9 100644
--- a/benchmarking/README.md
+++ b/benchmarking/README.md
@@ -23,7 +23,7 @@ In general, the expected time to apply a comorbidity method is the same between
`data.frame`s and `tibble`s. There is a notable decrease in time required when
`data.table`s are passed to `comorbidities()`. Best observed case: a
`data.table` took
-0.3042897
+0.3106419
the time of a `data.frame`.
@@ -53,302 +53,302 @@ the time of a `data.frame`.
1,000
data.frame
-
0.10
+
0.19
1.00
-
0.30
-
0.37
+
0.31
+
0.48
1.00
-
0.28
+
0.29
data.table
-
0.08
-
0.87
-
0.30
-
0.26
-
0.72
-
0.28
+
0.17
+
0.89
+
0.31
+
0.39
+
0.81
+
0.29
tibble
-
0.10
-
1.09
-
0.30
-
0.38
-
1.04
-
0.28
+
0.23
+
1.20
+
0.31
+
0.54
+
1.20
+
0.29
2,000
data.frame
-
0.17
+
0.28
1.00
-
0.30
-
0.72
+
0.31
+
0.85
1.00
-
0.29
+
0.31
data.table
-
0.14
-
0.80
+
0.23
+
0.83
+
0.31
+
0.60
+
0.72
0.30
-
0.45
-
0.64
-
0.29
tibble
-
0.18
-
1.08
0.30
-
0.73
-
1.02
+
1.07
+
0.31
+
0.79
+
0.98
0.30
5,000
data.frame
-
0.41
+
0.53
1.00
0.32
-
1.82
+
1.93
1.00
-
0.35
+
0.38
data.table
-
0.29
-
0.72
+
0.39
+
0.75
0.32
-
1.02
-
0.57
-
0.35
+
1.20
+
0.62
+
0.36
tibble
-
0.42
-
1.05
-
0.31
-
1.82
-
1.01
+
0.49
+
0.93
+
0.32
+
1.47
+
0.77
0.36
10,000
data.frame
-
0.88
+
0.99
1.00
-
0.33
-
3.88
+
0.34
+
4.00
1.00
-
0.44
+
0.48
data.table
-
0.58
0.66
-
0.33
-
2.08
-
0.54
+
0.68
+
0.34
+
2.26
+
0.57
0.44
tibble
-
0.89
-
1.02
-
0.33
-
3.89
-
1.01
-
0.45
+
0.83
+
0.84
+
0.34
+
2.70
+
0.68
+
0.44
20,000
data.frame
-
1.88
+
1.93
1.00
-
0.37
-
7.90
+
0.39
+
8.17
1.00
-
0.64
+
0.68
data.table
-
1.13
+
1.18
+
0.62
+
0.39
+
4.26
+
0.53
0.61
-
0.38
-
4.06
-
0.52
-
0.64
tibble
-
1.86
-
0.99
-
0.37
-
7.94
-
1.02
-
0.65
+
1.50
+
0.78
+
0.39
+
5.13
+
0.63
+
0.62
50,000
data.frame
-
4.87
+
4.72
1.00
-
0.49
-
19.52
+
0.54
+
20.42
1.00
-
1.23
+
1.26
data.table
2.61
-
0.54
-
0.52
-
9.48
-
0.49
-
1.23
+
0.56
+
0.53
+
9.66
+
0.48
+
1.12
tibble
-
4.69
-
0.97
-
0.51
-
19.75
-
1.03
-
1.23
+
3.33
+
0.72
+
0.53
+
11.96
+
0.59
+
1.16
100,000
data.frame
-
8.93
+
8.76
1.00
-
0.75
-
37.98
+
0.80
+
38.94
1.00
-
2.26
+
2.33
data.table
-
4.35
-
0.50
-
0.78
-
17.19
-
0.46
-
2.19
+
4.50
+
0.52
+
0.79
+
17.37
+
0.45
+
2.02
tibble
-
8.68
-
0.98
-
0.78
-
38.34
-
1.02
-
2.26
+
5.60
+
0.66
+
0.80
+
21.50
+
0.56
+
2.20
200,000
data.frame
-
16.59
+
16.55
1.00
-
1.26
-
74.58
+
1.32
+
75.52
1.00
-
4.29
+
4.43
data.table
-
7.51
-
0.46
-
1.29
-
31.77
+
8.00
+
0.49
+
1.27
+
32.04
0.43
-
4.05
+
3.84
tibble
-
16.28
-
1.00
+
9.75
+
0.60
1.29
-
75.20
-
1.02
-
4.27
+
39.68
+
0.53
+
4.21
500,000
data.frame
-
39.81
+
40.19
1.00
-
2.84
-
186.55
+
2.89
+
189.54
1.00
-
10.24
+
10.35
data.table
-
17.33
-
0.44
-
2.87
-
75.38
+
18.37
+
0.46
+
2.72
+
76.28
0.41
-
9.37
+
9.39
tibble
-
39.58
-
1.01
-
2.86
-
188.54
-
1.02
-
10.13
+
22.18
+
0.56
+
2.71
+
94.83
+
0.51
+
9.75
1,000,000
data.frame
-
78.88
+
80.76
1.00
-
5.79
-
379.24
+
5.71
+
389.11
1.00
-
20.63
+
20.00
data.table
-
34.63
+
35.76
0.45
-
5.81
-
148.65
-
0.40
-
18.32
+
5.33
+
150.93
+
0.39
+
19.53
tibble
-
79.85
-
1.03
-
5.69
-
383.84
-
1.02
-
20.22
+
43.54
+
0.55
+
5.15
+
189.79
+
0.49
+
18.45
@@ -380,302 +380,302 @@ the time of a `data.frame`.
1,000
data.frame
-
0.11
+
0.20
1.00
-
0.30
-
0.53
+
0.31
+
0.67
1.00
-
0.28
+
0.29
data.table
-
0.09
-
0.84
+
0.18
+
0.89
0.30
-
0.39
-
0.76
-
0.28
+
0.54
+
0.81
+
0.29
tibble
-
0.12
-
1.03
-
0.30
-
0.55
-
1.02
-
0.28
+
0.24
+
1.22
+
0.31
+
0.74
+
1.15
+
0.29
2,000
data.frame
-
0.20
+
0.29
1.00
-
0.30
-
1.05
+
0.31
+
1.19
1.00
-
0.30
+
0.31
data.table
-
0.15
-
0.77
+
0.25
+
0.84
0.30
-
0.69
-
0.67
-
0.31
+
0.85
+
0.73
+
0.32
tibble
-
0.20
-
1.02
-
0.30
-
1.05
-
1.01
+
0.32
+
1.11
+
0.31
+
1.12
+
0.97
0.31
5,000
data.frame
-
0.45
+
0.56
1.00
-
0.31
-
2.62
+
0.33
+
2.72
1.00
-
0.40
+
0.39
data.table
-
0.31
-
0.69
+
0.42
+
0.76
0.32
-
1.56
-
0.60
-
0.40
+
1.72
+
0.64
+
0.41
tibble
-
0.45
-
1.01
-
0.31
-
2.61
-
1.00
+
0.54
+
0.97
+
0.32
+
2.15
+
0.79
0.40
10,000
data.frame
-
0.96
+
1.06
1.00
-
0.33
-
5.50
+
0.35
+
5.55
1.00
-
0.55
+
0.52
data.table
-
0.60
-
0.63
+
0.72
+
0.69
0.34
-
3.11
+
3.23
+
0.58
0.56
-
0.55
tibble
-
0.95
-
1.00
-
0.33
-
5.46
-
0.99
-
0.56
+
0.92
+
0.88
+
0.34
+
3.97
+
0.72
+
0.54
20,000
data.frame
-
2.04
+
2.10
1.00
0.39
-
10.97
+
11.14
1.00
-
0.89
+
0.81
data.table
-
1.17
-
0.58
+
1.31
+
0.63
0.39
-
6.01
-
0.55
-
0.87
+
6.04
+
0.54
+
0.86
tibble
-
2.00
-
0.98
-
0.38
-
10.96
-
1.01
-
0.90
+
1.66
+
0.80
+
0.39
+
7.40
+
0.66
+
0.82
50,000
data.frame
-
5.29
+
5.19
1.00
-
0.56
-
26.37
+
0.54
+
27.39
1.00
-
1.89
+
1.69
data.table
-
2.78
-
0.53
+
2.92
+
0.57
0.55
-
13.89
-
0.53
-
1.77
+
13.76
+
0.51
+
1.72
tibble
-
5.15
-
0.98
+
3.66
+
0.71
0.55
-
26.73
-
1.02
-
1.91
+
16.85
+
0.62
+
1.62
100,000
data.frame
-
9.82
+
9.49
1.00
-
0.88
-
50.73
+
0.81
+
52.73
1.00
-
3.47
+
3.17
data.table
-
4.86
-
0.50
-
0.84
-
25.19
-
0.50
-
3.12
+
5.01
+
0.54
+
0.85
+
25.57
+
0.49
+
3.07
tibble
-
9.59
-
0.98
-
0.87
-
51.58
-
1.03
-
3.44
+
6.10
+
0.66
+
0.82
+
30.70
+
0.59
+
2.94
200,000
data.frame
-
18.38
+
17.66
1.00
-
1.49
-
99.30
+
1.33
+
102.68
1.00
-
6.50
+
6.15
data.table
-
8.69
-
0.48
-
1.39
-
46.50
+
8.78
+
0.51
+
1.40
+
48.30
0.48
-
5.66
+
5.73
tibble
-
17.97
-
0.99
-
1.47
-
100.52
-
1.03
-
6.41
+
10.56
+
0.62
+
1.35
+
57.25
+
0.57
+
5.59
500,000
data.frame
-
44.21
+
42.42
1.00
-
3.23
-
250.26
+
2.92
+
255.09
1.00
-
15.30
+
15.10
data.table
-
20.05
+
19.67
+
0.47
+
2.99
+
115.50
0.46
-
3.09
-
109.65
-
0.44
-
12.90
+
13.88
tibble
-
43.14
-
0.99
-
3.23
-
249.23
-
1.01
-
15.37
+
24.13
+
0.58
+
2.83
+
137.08
+
0.54
+
13.88
1,000,000
data.frame
-
87.64
+
84.97
1.00
-
6.23
-
513.48
+
5.89
+
517.47
1.00
-
30.19
+
30.59
data.table
-
39.25
+
37.75
0.45
-
6.26
-
215.76
-
0.42
-
25.19
+
5.64
+
228.19
+
0.45
+
28.42
tibble
-
85.84
-
0.99
-
6.25
-
504.87
-
0.99
-
31.42
+
47.85
+
0.58
+
5.32
+
273.80
+
0.53
+
28.96
@@ -707,302 +707,302 @@ the time of a `data.frame`.
1,000
data.frame
-
0.15
+
0.25
1.00
-
0.29
-
1.27
+
0.30
+
1.39
1.00
-
0.32
+
0.34
data.table
-
0.10
-
0.68
-
0.29
-
0.87
-
0.70
-
0.32
+
0.19
+
0.75
+
0.30
+
1.05
+
0.78
+
0.33
tibble
-
0.16
-
1.07
0.29
-
1.31
-
1.04
+
1.15
+
0.30
+
1.52
+
1.15
0.32
2,000
data.frame
-
0.28
+
0.38
1.00
-
0.29
-
2.50
+
0.31
+
2.60
1.00
-
0.38
+
0.41
data.table
-
0.16
-
0.59
-
0.30
-
1.52
-
0.62
-
0.38
+
0.26
+
0.68
+
0.31
+
1.73
+
0.68
+
0.40
tibble
-
0.29
-
1.05
-
0.29
-
2.52
-
1.01
+
0.39
+
1.03
+
0.31
+
2.40
+
0.96
0.39
5,000
data.frame
-
0.68
+
0.77
1.00
-
0.31
-
6.23
+
0.33
+
6.18
1.00
-
0.59
+
0.63
data.table
-
0.32
-
0.49
-
0.32
-
3.30
-
0.53
+
0.44
0.58
+
0.33
+
3.56
+
0.58
+
0.59
tibble
-
0.70
-
1.03
-
0.31
-
6.17
-
0.99
-
0.61
+
0.68
+
0.89
+
0.34
+
4.76
+
0.77
+
0.58
10,000
data.frame
-
1.46
+
1.51
1.00
-
0.34
-
12.75
+
0.37
+
12.50
1.00
-
0.98
+
1.02
data.table
-
0.63
-
0.43
-
0.34
-
6.17
-
0.49
+
0.77
+
0.51
+
0.35
+
6.61
+
0.53
0.92
tibble
-
1.49
-
1.02
-
0.33
-
12.56
-
0.99
-
1.01
+
1.20
+
0.79
+
0.38
+
8.65
+
0.69
+
0.94
20,000
data.frame
-
3.05
+
3.02
1.00
-
0.40
-
25.49
+
0.44
+
24.77
1.00
-
1.79
+
1.81
data.table
-
1.25
-
0.41
-
0.41
-
11.48
-
0.45
-
1.61
+
1.39
+
0.46
+
0.42
+
12.24
+
0.49
+
1.58
tibble
-
3.08
-
1.02
-
0.40
-
25.15
-
0.99
-
1.85
+
2.18
+
0.73
+
0.47
+
15.96
+
0.65
+
1.65
50,000
data.frame
-
7.64
+
7.51
1.00
-
0.62
-
62.74
+
0.67
+
60.78
1.00
-
4.26
+
4.11
data.table
-
3.03
-
0.40
-
0.60
-
26.50
+
3.11
0.42
-
3.66
+
0.60
+
27.76
+
0.46
+
3.59
tibble
-
7.66
-
1.01
-
0.62
-
62.45
-
1.00
-
4.38
+
4.84
+
0.65
+
0.73
+
36.71
+
0.61
+
3.78
100,000
data.frame
-
14.41
+
14.21
1.00
-
1.02
-
121.75
+
1.08
+
118.87
1.00
-
8.32
+
7.83
data.table
-
5.33
+
5.30
0.38
-
0.93
-
50.89
-
0.42
-
6.99
+
0.91
+
51.09
+
0.44
+
7.09
tibble
-
14.23
-
1.00
-
1.01
-
122.85
-
1.01
-
8.30
+
8.45
+
0.61
+
1.16
+
69.71
+
0.59
+
7.16
200,000
data.frame
-
27.66
+
27.31
1.00
-
1.76
-
239.68
+
1.87
+
235.78
1.00
-
16.37
+
15.24
data.table
-
9.52
+
9.34
0.35
-
1.55
-
99.35
+
1.49
+
97.30
0.42
-
13.63
+
14.05
tibble
-
26.95
-
0.99
-
1.74
-
243.83
-
1.02
-
16.01
+
15.28
+
0.57
+
1.98
+
135.19
+
0.58
+
13.80
500,000
data.frame
-
69.05
+
67.70
1.00
-
3.83
-
607.95
+
4.14
+
601.65
1.00
-
40.08
+
37.74
data.table
-
21.85
+
21.48
0.32
-
3.28
-
246.75
+
3.20
+
243.94
0.41
-
33.83
+
34.18
tibble
-
66.26
-
0.97
-
3.79
-
616.73
-
1.02
-
39.18
+
36.19
+
0.54
+
4.31
+
336.73
+
0.56
+
33.70
1,000,000
data.frame
-
140.69
+
137.87
1.00
-
7.26
-
1256.20
+
8.04
+
1247.81
1.00
-
79.21
+
76.08
data.table
-
42.29
-
0.30
-
6.21
-
502.29
-
0.40
-
68.99
+
42.50
+
0.31
+
6.19
+
510.88
+
0.41
+
66.85
tibble
-
133.94
-
0.96
-
7.23
-
1262.62
-
1.01
-
78.35
+
72.45
+
0.53
+
8.44
+
690.52
+
0.56
+
67.83
@@ -1034,302 +1034,302 @@ the time of a `data.frame`.
1,000
data.frame
-
0.18
+
0.27
1.00
-
0.29
-
1.38
+
0.30
+
1.51
1.00
0.34
data.table
-
0.14
+
0.22
+
0.83
+
0.31
+
1.17
0.80
-
0.29
-
0.98
-
0.73
-
0.33
+
0.34
tibble
-
0.18
-
1.05
-
0.29
-
1.38
-
1.01
+
0.40
+
1.53
+
0.31
+
1.74
+
1.24
0.33
2,000
data.frame
-
0.32
+
0.42
1.00
-
0.29
-
2.69
+
0.30
+
2.84
1.00
0.41
data.table
-
0.21
-
0.69
0.30
-
1.71
-
0.65
-
0.39
+
0.73
+
0.31
+
1.93
+
0.70
+
0.40
tibble
-
0.32
-
1.02
-
0.29
-
2.66
-
0.99
-
0.41
+
0.53
+
1.32
+
0.31
+
2.73
+
1.01
+
0.40
5,000
data.frame
-
0.75
+
0.84
1.00
-
0.32
-
6.58
+
0.33
+
6.77
1.00
-
0.63
+
0.64
data.table
-
0.42
-
0.56
-
0.32
-
3.69
-
0.56
+
0.51
+
0.62
+
0.33
+
3.97
+
0.59
0.60
tibble
-
0.75
-
1.01
-
0.31
-
6.51
-
0.99
-
0.63
+
0.89
+
1.07
+
0.34
+
5.38
+
0.79
+
0.61
10,000
data.frame
-
1.59
+
1.62
1.00
-
0.35
-
13.16
+
0.36
+
13.61
1.00
-
1.03
+
1.06
data.table
-
0.80
-
0.50
-
0.34
-
6.87
-
0.53
-
0.96
+
0.88
+
0.55
+
0.36
+
7.31
+
0.54
+
0.95
tibble
-
1.59
-
1.00
-
0.34
-
13.20
-
1.01
-
1.03
+
1.50
+
0.92
+
0.37
+
9.79
+
0.71
+
0.97
20,000
data.frame
-
3.31
+
3.24
1.00
-
0.41
-
25.84
+
0.44
+
26.82
1.00
-
1.80
+
1.87
data.table
-
1.55
-
0.47
-
0.41
-
12.74
+
1.60
0.50
-
1.69
+
0.42
+
13.54
+
0.51
+
1.65
tibble
-
3.27
-
1.00
-
0.41
-
26.55
-
1.04
-
1.88
+
2.69
+
0.83
+
0.45
+
17.90
+
0.67
+
1.71
50,000
data.frame
-
8.29
+
8.09
1.00
-
0.62
-
63.01
+
0.67
+
65.35
1.00
-
4.09
+
4.26
data.table
-
3.66
+
3.59
0.45
0.60
-
29.36
-
0.47
-
3.91
+
31.18
+
0.48
+
3.78
tibble
-
8.18
-
1.00
-
0.61
-
66.69
-
1.07
-
4.44
+
5.95
+
0.75
+
0.67
+
40.48
+
0.63
+
3.93
100,000
data.frame
-
15.53
+
15.56
1.00
-
1.01
-
125.06
+
1.07
+
127.35
1.00
-
7.96
+
8.20
data.table
-
6.46
-
0.42
-
0.94
-
56.70
-
0.46
-
7.77
+
6.26
+
0.41
+
0.92
+
59.44
+
0.47
+
7.40
tibble
-
15.49
-
1.02
-
1.00
-
132.57
-
1.07
-
8.50
+
10.21
+
0.67
+
1.04
+
75.43
+
0.60
+
7.63
200,000
data.frame
-
29.64
+
30.28
1.00
-
1.75
-
251.28
+
1.84
+
252.49
1.00
-
15.81
+
16.10
data.table
-
11.57
-
0.40
-
1.58
-
111.75
-
0.45
-
15.42
+
11.32
+
0.38
+
1.53
+
115.89
+
0.46
+
14.68
tibble
-
29.67
-
1.02
+
18.18
+
0.61
1.74
-
264.40
-
1.06
-
16.49
+
145.13
+
0.58
+
14.86
500,000
data.frame
-
73.79
+
75.12
1.00
-
3.79
-
646.17
+
4.06
+
647.57
1.00
-
39.75
+
40.13
data.table
-
26.48
-
0.37
-
3.46
-
282.87
-
0.44
-
37.05
+
26.77
+
0.36
+
3.39
+
291.30
+
0.45
+
36.34
tibble
-
72.85
-
1.00
-
3.76
-
664.81
-
1.03
-
40.60
+
42.37
+
0.57
+
3.82
+
365.95
+
0.57
+
35.56
1,000,000
data.frame
-
150.63
+
151.87
1.00
-
7.05
-
1341.64
+
7.90
+
1349.61
1.00
-
80.41
+
81.79
data.table
-
51.26
-
0.35
-
6.73
-
585.30
-
0.44
-
71.25
+
53.90
+
0.36
+
6.74
+
602.01
+
0.45
+
72.67
tibble
-
146.26
-
0.98
-
7.01
-
1347.54
-
1.01
-
82.00
+
84.35
+
0.56
+
7.57
+
764.11
+
0.57
+
69.49
diff --git a/benchmarking/benchmark-composite.pdf b/benchmarking/benchmark-composite.pdf
new file mode 100644
index 00000000..db8d8798
Binary files /dev/null and b/benchmarking/benchmark-composite.pdf differ
diff --git a/benchmarking/benchmark-composite.svg b/benchmarking/benchmark-composite.svg
new file mode 100644
index 00000000..d6e6055f
--- /dev/null
+++ b/benchmarking/benchmark-composite.svg
@@ -0,0 +1,2418 @@
+
+
diff --git a/benchmarking/benchmark-relative.pdf b/benchmarking/benchmark-relative.pdf
new file mode 100644
index 00000000..e81d0307
Binary files /dev/null and b/benchmarking/benchmark-relative.pdf differ
diff --git a/benchmarking/benchmark-relative.svg b/benchmarking/benchmark-relative.svg
new file mode 100644
index 00000000..ead5d4d5
--- /dev/null
+++ b/benchmarking/benchmark-relative.svg
@@ -0,0 +1,924 @@
+
+
diff --git a/benchmarking/benchmark2-summary.R b/benchmarking/benchmark-summary.R
similarity index 74%
rename from benchmarking/benchmark2-summary.R
rename to benchmarking/benchmark-summary.R
index 8ba41ce0..49e946fe 100644
--- a/benchmarking/benchmark2-summary.R
+++ b/benchmarking/benchmark-summary.R
@@ -3,38 +3,38 @@ library(ggplot2)
################################################################################
# data import
-bench2 <-
- list.files("bench2_results", full.names = TRUE) |>
+bench <-
+ list.files("bench_results", full.names = TRUE) |>
lapply(readRDS) |>
lapply(setDT) |>
rbindlist()
mem <-
- list.files("./logs2/mem", pattern = "\\.tsv$", full.names = TRUE, recursive = TRUE) |>
+ list.files("./logs/mem", pattern = "\\.tsv$", full.names = TRUE, recursive = TRUE) |>
lapply(fread) |>
rbindlist()
mem[, subconditions := grepl("pccc_v3.1s", method)]
mem[, method := sub("s$", "", method)]
setnames(mem, "flag_method", "flag.method")
-bench2_summary <-
- bench2[, .(median_time_seconds = median(time_seconds)) , by = .(data_class, subjects, encounters, seed, method, subconditions, flag.method) ]
+bench_summary <-
+ bench[, .(median_time_seconds = median(time_seconds)) , by = .(data_class, subjects, encounters, seed, method, subconditions, flag.method) ]
mem_summary <-
mem[, .(median_rss_kib = median(max_rss_kib)), by = .(data_class, subjects, seed, method, subconditions, flag.method)]
-bench2_summary <-
- merge(bench2_summary, mem_summary, all = TRUE)
+bench_summary <-
+ merge(bench_summary, mem_summary, all = TRUE)
-bench2_summary[!is.na(median_time_seconds) & !is.na(median_rss_kib)]
+bench_summary[!is.na(median_time_seconds) & !is.na(median_rss_kib)]
-bench2_summary[, data_class := fcase(data_class == "DF", "data.frame",
+bench_summary[, data_class := fcase(data_class == "DF", "data.frame",
data_class == "DT", "data.table",
data_class == "TBL", "tibble")]
# relative time
-bench2_summary[!is.na(median_time_seconds), df_median := median_time_seconds[data_class == "data.frame"], by = .(subjects, encounters, method, subconditions, flag.method)]
-bench2_summary[, relative_time := (median_time_seconds / df_median)]
-bench2_summary[, df_median := NULL]
+bench_summary[!is.na(median_time_seconds), df_median := median_time_seconds[data_class == "data.frame"], by = .(subjects, encounters, method, subconditions, flag.method)]
+bench_summary[, relative_time := (median_time_seconds / df_median)]
+bench_summary[, df_median := NULL]
################################################################################
# Plotting helpers
@@ -48,7 +48,7 @@ facet_spec <- . ~ fifelse(subconditions,
paste(method, "(with subconditions)"),
method) + flag.method
g <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters, y = median_time_seconds,
color = data_class,
@@ -74,11 +74,11 @@ g <-
axis.text.x = element_text(hjust = 0.75)
)
-ggsave(file = "benchmark2.pdf", plot = g, width = 12, height = 7)
-ggsave(file = "benchmark2.svg", plot = g, width = 12, height = 7)
+ggsave(file = "benchmark.pdf", plot = g, width = 12, height = 7)
+ggsave(file = "benchmark.svg", plot = g, width = 12, height = 7)
gr <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters, y = relative_time, color = data_class, fill = data_class, linetype = data_class) +
stat_smooth(method = "loess", formula = y ~ x) +
@@ -97,11 +97,11 @@ gr <-
axis.text.x = element_text(hjust = 0.75)
)
-ggsave(file = "benchmark2-relative.svg", plot = gr, width = 12, height = 7)
-ggsave(file = "benchmark2-relative.pdf", plot = gr, width = 12, height = 7)
+ggsave(file = "benchmark-relative.svg", plot = gr, width = 12, height = 7)
+ggsave(file = "benchmark-relative.pdf", plot = gr, width = 12, height = 7)
g <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters, y = median_rss_kib / (1024^2),
color = data_class,
@@ -135,11 +135,11 @@ facet_spec <- . ~ fifelse(subconditions, paste(method, "(with subconditions)"),
outtable <- list()
-for (mt in unique(bench2_summary$method)) {
- for (sc in unique(bench2_summary$subconditions)) {
- for (dc in unique(bench2_summary$data_class)) {
- for (fm in unique(bench2_summary$flag.method)) {
- thisdt <- subset(bench2_summary, method == mt & subconditions == sc & data_class == dc & flag.method == fm)
+for (mt in unique(bench_summary$method)) {
+ for (sc in unique(bench_summary$subconditions)) {
+ for (dc in unique(bench_summary$data_class)) {
+ for (fm in unique(bench_summary$flag.method)) {
+ thisdt <- subset(bench_summary, method == mt & subconditions == sc & data_class == dc & flag.method == fm)
if (nrow(thisdt)) {
ats_loess <- loess(log10(median_time_seconds) ~ log10(encounters), data = thisdt)
ats <- predict(ats_loess, se = TRUE)
@@ -147,14 +147,14 @@ for (mt in unique(bench2_summary$method)) {
mem_loess <- loess(log10(median_rss_kib) ~ log10(encounters), data = thisdt)
mem <- predict(mem_loess, se = TRUE)
- bench2_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(median_time_seconds) & !is.na(encounters),
+ bench_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(median_time_seconds) & !is.na(encounters),
`:=`(
time_smoothed_y = 10^(ats$fit),
time_smoothed_lwr = 10^(ats$fit - 1.96 * ats$se.fit),
time_smoothed_upr = 10^(ats$fit + 1.96 * ats$se.fit)
)]
- bench2_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(median_rss_kib) & !is.na(encounters),
+ bench_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(median_rss_kib) & !is.na(encounters),
`:=`(
mem_smoothed_y = 10^(mem$fit),
mem_smoothed_lwr = 10^(mem$fit - 1.96 * mem$se.fit),
@@ -163,7 +163,7 @@ for (mt in unique(bench2_summary$method)) {
if (dc != "data.frame") {
rts_loess <- loess(relative_time ~ log10(encounters), data = thisdt)
rts <- predict(rts_loess, se = TRUE)
- bench2_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(relative_time) & !is.na(encounters),
+ bench_summary[method == mt & subconditions == sc & data_class == dc & flag.method == fm & !is.na(relative_time) & !is.na(encounters),
`:=`(
rel_time_smoothed_y = rts$fit,
rel_time_smoothed_lwr = rts$fit - 1.96 * rts$se.fit,
@@ -190,10 +190,8 @@ for (mt in unique(bench2_summary$method)) {
}
}
-outtable <- rbindlist(outtable)
-saveRDS(outtable, file = "outtable.rds")
-bench2_summary[data_class == "data.frame",
+bench_summary[data_class == "data.frame",
`:=`(
rel_time_smoothed_y = 1,
rel_time_smoothed_lwr = 1,
@@ -202,15 +200,15 @@ bench2_summary[data_class == "data.frame",
# use this data set to identify the flag.method
-setkey(bench2_summary,
+setkey(bench_summary,
method, data_class, subconditions, flag.method, subjects)
fmpt <-
- bench2_summary[, .(encounters = max(encounters, na.rm = TRUE)), keyby = .(method, data_class, subconditions, flag.method, subjects)]
-fmpt <- bench2_summary[fmpt, on = c(key(fmpt), "encounters")]
+ bench_summary[, .(encounters = max(encounters, na.rm = TRUE)), keyby = .(method, data_class, subconditions, flag.method, subjects)]
+fmpt <- bench_summary[fmpt, on = c(key(fmpt), "encounters")]
fmpt <- unique(fmpt)
g1 <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters,
y = time_smoothed_y,
@@ -223,7 +221,6 @@ g1 <-
) +
geom_line() +
geom_ribbon(alpha = 0.2, mapping = aes(color = NULL)) +
- #geom_point(mapping = aes(y = median)) +
geom_point(data = fmpt, mapping = aes(shape = flag.method), size = 2) +
scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si(""))) +
scale_y_log10(labels = scales::label_comma()) +
@@ -242,7 +239,7 @@ g1 <-
)
g2 <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters,
y = rel_time_smoothed_y,
@@ -256,7 +253,7 @@ g2 <-
geom_line() +
geom_ribbon(alpha = 0.2, mapping = aes(color = NULL)) +
geom_point(data = fmpt[data_class != "data.frame"], mapping = aes(shape = flag.method), size = 2) +
- scale_y_continuous(breaks = seq(0.4, 1.4, by = 0.2)) +
+ scale_y_continuous() + #breaks = seq(0.4, 1.4, by = 0.2)) +
scale_x_log10(labels = scales::label_number(scale_cut = scales::cut_si(""))) +
annotation_logticks(sides = "b") +
scale_fill_manual(name = "Data Class", values = cclr) +
@@ -272,7 +269,7 @@ g2 <-
)
g3 <-
- ggplot(bench2_summary) +
+ ggplot(bench_summary) +
theme_bw() +
aes(x = encounters,
y = mem_smoothed_y / (1024^2),
@@ -302,7 +299,7 @@ g3 <-
axis.text.x = element_text(hjust = 0.75)
)
-svglite::svglite(filename = "benchmark2-composite.svg", width = 9, height = 7)
+svglite::svglite(filename = "benchmark-composite.svg", width = 9, height = 7)
ggpubr::ggarrange(g1 + theme(axis.title.x = element_blank(), axis.text.x = element_blank()),
g2 + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), strip.text = element_blank(), strip.background = element_blank()),
@@ -311,9 +308,27 @@ svglite::svglite(filename = "benchmark2-composite.svg", width = 9, height = 7)
dev.off()
-pdf(file = "benchmark2-composite.pdf", width = 12, height = 9)
+png(filename = "benchmark-composite.png", width = 9, height = 7)
+
+ ggpubr::ggarrange(g1 + theme(axis.title.x = element_blank(), axis.text.x = element_blank()),
+ g2 + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), strip.text = element_blank(), strip.background = element_blank()),
+ g3 + theme(strip.text = element_blank(), strip.background = element_blank()),
+ ncol = 1, align = "v", common.legend = TRUE)
+
+dev.off()
+
+pdf(file = "benchmark-composite.pdf", width = 12, height = 9)
ggpubr::ggarrange(g1 + theme(axis.title.x = element_blank(), axis.text.x = element_blank()),
g2 + theme(axis.title.x = element_blank(), axis.text.x = element_blank()),
g3,
ncol = 1, align = "v", common.legend = TRUE)
dev.off()
+
+################################################################################
+# final step - save the outtable.rds file, this is tracked in the Makefile
+outtable <- rbindlist(outtable)
+saveRDS(outtable, file = "outtable.rds")
+
+################################################################################
+# End of File #
+################################################################################
diff --git a/benchmarking/benchmark-utilities.R b/benchmarking/benchmark-utilities.R
index 252558c4..a29777ab 100644
--- a/benchmarking/benchmark-utilities.R
+++ b/benchmarking/benchmark-utilities.R
@@ -45,63 +45,7 @@ icd_codes <- medicalcoder::get_icd_codes()[, c("icdv", "dx", "full_code")]
icd_dx_codes <- icd_codes[icd_codes$dx == 1, ]
icd_pr_codes <- icd_codes[icd_codes$dx == 0, ]
-build_set1 <- function(data_class = c("DF", "DT", "TBL") , subjects = 10 , seed = 1) {
- subjects <- as.integer(subjects)
- stopifnot(subjects > 0)
- seed <- as.integer(seed)
- set.seed(seed)
-
- data_class <- match.arg(data_class)
-
- dxs <- as.list(sample(dx_distribution$ncodes, size = subjects, prob = dx_distribution$d, replace = TRUE))
- prs <- as.list(sample(pr_distribution$ncodes, size = subjects, prob = pr_distribution$d, replace = TRUE))
-
- # How many dx and pr codes from pccc?
- pdxs <- lapply(lapply(dxs, runif), function(x) sum(x < 0.3))
- pprs <- lapply(lapply(prs, runif), function(x) sum(x < 0.3))
-
- # any icd codes
- dxs <- mapply(function(x, y) {x - y}, x = dxs, y = pdxs)
- prs <- mapply(function(x, y) {x - y}, x = prs, y = pprs)
-
- set <-
- mapply(function(sid, pdx, ppr, adx, apr) {
- x <-
- rbind(
- pccc_dx_codes[sample(seq_len(nrow(pccc_dx_codes)), size = pdx), ],
- pccc_pr_codes[sample(seq_len(nrow(pccc_pr_codes)), size = ppr), ],
- icd_dx_codes[sample(seq_len(nrow(icd_dx_codes)), size = adx), ],
- icd_pr_codes[sample(seq_len(nrow(icd_pr_codes)), size = apr), ]
- )
- if (nrow(x)) {
- x$subject_id <- sid
- } else {
- x <- data.frame(icdv = NA_integer_, dx = NA_integer_, full_code = NA_character_, subject_id = sid)
- }
- x
- },
- pdx = pdxs, ppr = pprs, adx = dxs, apr = prs, sid = seq_along(pdxs),
- SIMPLIFY = FALSE
- )
- set <- do.call(rbind, set)
- rownames(set) <- NULL
-
- if (data_class == "DT") {
- require(data.table)
- data.table::setDT(set)
- } else if (data_class == "TBL") {
- require(tibble)
- set <- as_tibble(set)
- }
-
- attr(set, "data_class") <- data_class
- attr(set, "nsubjects") <- subjects
- attr(set, "nencounters") <- subjects
-
- set
-}
-
-build_set2 <- function(data_class = c("DF", "DT", "TBL") , subjects = 10 , seed = 1) {
+build_set <- function(data_class = c("DF", "DT", "TBL") , subjects = 10 , seed = 1) {
subjects <- as.integer(subjects)
stopifnot(subjects > 0)
seed <- as.integer(seed)
@@ -164,35 +108,7 @@ build_set2 <- function(data_class = c("DF", "DT", "TBL") , subjects = 10 , seed
set
}
-benchmark1 <- function(data, method, subconditions) {
- tic <- Sys.time()
- comorbidities(
- data = data,
- icd.codes = "full_code",
- id.vars = "subject_id",
- icdv.var = "icdv",
- dx.var = "dx",
- poa = 1,
- primarydx = 0,
- flag.method = "current",
- method = method,
- subconditions = subconditions
- )
- toc <- Sys.time()
-
- data.frame(
- data_class = attr(data, "data_class"),
- subjects = attr(data, "nsubjects"),
- encounters = attr(data, "nencounters"),
- method = method,
- subconditions = subconditions,
- flag.method = "current",
- seed = 1,
- time_seconds = as.numeric(difftime(toc, tic, units = "secs"))
- )
-}
-
-benchmark2 <- function(data, method, subconditions, flag.method) {
+benchmark <- function(data, method, subconditions, flag.method) {
tic <- Sys.time()
comorbidities(
data = data,
diff --git a/benchmarking/benchmark2.R b/benchmarking/benchmark.R
similarity index 87%
rename from benchmarking/benchmark2.R
rename to benchmarking/benchmark.R
index c13a4cfd..033557de 100644
--- a/benchmarking/benchmark2.R
+++ b/benchmarking/benchmark.R
@@ -9,7 +9,7 @@ if (interactive()) {
args <- as.list(strsplit(outfile, split = "__")[[1]])
this_data_set <-
- build_set2(
+ build_set(
data_class = args[[1]],
subjects = as.integer(args[[2]]),
seed = as.integer(args[[5]])
@@ -24,12 +24,12 @@ if (subconditions) {
}
output <-
- benchmark2(
+ benchmark(
data = this_data_set,
method = method,
subconditions = subconditions,
flag.method = flag_method
)
-saveRDS(output, file = file.path("bench2_results", outfile))
+saveRDS(output, file = file.path("bench_results", outfile))
diff --git a/benchmarking/benchmark.pdf b/benchmarking/benchmark.pdf
new file mode 100644
index 00000000..052393f4
Binary files /dev/null and b/benchmarking/benchmark.pdf differ
diff --git a/benchmarking/benchmark.svg b/benchmarking/benchmark.svg
new file mode 100644
index 00000000..5eb0607a
--- /dev/null
+++ b/benchmarking/benchmark.svg
@@ -0,0 +1,5301 @@
+
+
diff --git a/benchmarking/benchmark1-summary.R b/benchmarking/benchmark1-summary.R
deleted file mode 100644
index 6ba53690..00000000
--- a/benchmarking/benchmark1-summary.R
+++ /dev/null
@@ -1,77 +0,0 @@
-library(data.table)
-library(ggplot2)
-
-bench1 <-
- list.files("bench1_results", full.names = TRUE) |>
- lapply(readRDS) |>
- lapply(setDT) |>
- rbindlist()
-
-bench1[, data_class := fcase(data_class == "DF", "data.frame",
- data_class == "DT", "data.table",
- data_class == "TBL", "tibble")]
-
-cclr <- c("data.table" = "#8da0cb", "tibble" = "#fc8d62", "data.frame" = "#66c2a5")
-ctyp <- c("data.frame" = 2, "data.table" = 1, "tibble" = 3)
-
-bench1_summary <-
- bench1[,
- .(mean = mean(time_seconds), median = median(time_seconds), q3 = quantile(time_seconds, prob = 0.75), q1 = quantile(time_seconds, prob = 0.25))
- , by = .(data_class, subjects, encounters, method, subconditions, flag.method)
- ]
-
-# relative time
-bench1_summary[, df_mean := mean[data_class == "data.frame"], by = .(subjects, encounters, method, subconditions, flag.method)]
-bench1_summary[, rt := (mean / df_mean)]
-
-ggplot(bench1_summary) +
- theme_bw() +
- aes(x = subjects,
- ymin = q1,
- y = median,
- ymax = q3,
- color = data_class,
- fill = data_class,
- linetype = data_class,
- shape = data_class
- ) +
- geom_point() +
- geom_line() +
- scale_x_log10(labels = scales::label_comma()) +
- scale_y_log10(labels = scales::label_comma()) +
- scale_fill_manual(name = "Data Class", values = cclr) +
- scale_color_manual(name = "Data Class", values = cclr) +
- scale_linetype_manual(name = "Data Class", values = ctyp) +
- scale_shape_manual(name = "Data Class", values = ctyp) +
- annotation_logticks() +
- xlab("Encounters") +
- ylab("Time (seconds)") +
- facet_wrap(. ~ fifelse(subconditions, paste(method, "(with subconditions)"), method)) +
- theme(
- panel.grid.minor = element_blank(),
- legend.position = "bottom",
- axis.text.x = element_text(hjust = 0.75)
- )
-
-ggsave(file = "benchmark1.svg", width = 7, height = 7)
-
-ggplot(bench1_summary) +
- theme_bw() +
- aes(x = encounters, y = rt, color = data_class, fill = data_class, linetype = data_class) +
- stat_smooth(method = "loess", formula = y ~ x) +
- scale_y_continuous() +
- scale_x_log10(labels = scales::label_comma()) +
- annotation_logticks(sides = "b") +
- scale_color_manual(name = "Data Class", values = cclr) +
- scale_fill_manual(name = "Data Class", values = cclr) +
- scale_linetype_manual(name = "Data Class", values = ctyp) +
- xlab("Encounters") +
- ylab("Relative expected run time (vs data.frame)") +
- facet_wrap(. ~ fifelse(subconditions, paste(method, "(with subconditions)"), method)) +
- theme(
- panel.grid.minor.x = element_blank(),
- legend.position = "bottom",
- axis.text.x = element_text(hjust = 0.75)
- )
-
-ggsave(file = "benchmark1-relative.svg", width = 7, height = 7)
diff --git a/benchmarking/benchmark1.R b/benchmarking/benchmark1.R
deleted file mode 100644
index d8ac6cf8..00000000
--- a/benchmarking/benchmark1.R
+++ /dev/null
@@ -1,33 +0,0 @@
-source("benchmark-utilities.R")
-
-if (interactive()) {
- outfile <- "DT__1e3__pccc_v3.1__1__1.rds"
-} else {
- outfile <- commandArgs(trailingOnly = TRUE)
-}
-
-args <- as.list(strsplit(outfile, split = "__")[[1]])
-
-this_data_set <-
- build_set1(
- data_class = args[[1]],
- subjects = as.integer(args[[2]]),
- seed = as.integer(args[[4]])
- )
-
-method <- args[[3]]
-subconditions <- endsWith(method, "s")
-
-if (subconditions) {
- method <- sub("s$", "", method)
-}
-
-output <-
- benchmark1(
- data = this_data_set,
- method = method,
- subconditions = subconditions
- )
-
-saveRDS(output, file = file.path("bench1_results", outfile))
-
diff --git a/benchmarking/benchmark2-composite.png b/benchmarking/benchmark2-composite.png
deleted file mode 100644
index 8d34a22c..00000000
Binary files a/benchmarking/benchmark2-composite.png and /dev/null differ
diff --git a/benchmarking/benchmark2-composite.svg b/benchmarking/benchmark2-composite.svg
deleted file mode 100644
index 9e4ce247..00000000
--- a/benchmarking/benchmark2-composite.svg
+++ /dev/null
@@ -1,2464 +0,0 @@
-
-
diff --git a/benchmarking/benchmark2-relative.svg b/benchmarking/benchmark2-relative.svg
deleted file mode 100644
index 99c43cb2..00000000
--- a/benchmarking/benchmark2-relative.svg
+++ /dev/null
@@ -1,884 +0,0 @@
-
-
diff --git a/benchmarking/benchmark2.svg b/benchmarking/benchmark2.svg
deleted file mode 100644
index 2d0f9fe9..00000000
--- a/benchmarking/benchmark2.svg
+++ /dev/null
@@ -1,5333 +0,0 @@
-
-
diff --git a/benchmarking/make_grid2.sh b/benchmarking/make_grid.sh
similarity index 97%
rename from benchmarking/make_grid2.sh
rename to benchmarking/make_grid.sh
index 75d4a823..973c8af7 100755
--- a/benchmarking/make_grid2.sh
+++ b/benchmarking/make_grid.sh
@@ -22,4 +22,4 @@ ITERS=$(seq 1 5)
done
done
done
-} > grid2.tsv
+} > grid.tsv
diff --git a/benchmarking/make_grid1.sh b/benchmarking/make_grid1.sh
deleted file mode 100755
index efca7ffb..00000000
--- a/benchmarking/make_grid1.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-DATA_CLASSES=(DF DT TBL)
-SUBJECTS=(1e1 1e2 1e3 5e3 1e4 5e4 1e5 5e5 1e6)
-METHODS=(pccc_v3.1 pccc_v3.1s charlson_quan2005 elixhauser_quan2005)
-SEEDS=$(seq 1 10)
-ITERS=$(seq 1 10)
-
-{
- echo -e "data_class\tsubjects\tmethod\tseed\titer"
- for dc in "${DATA_CLASSES[@]}"; do
- for n in "${SUBJECTS[@]}"; do
- for m in "${METHODS[@]}"; do
- for s in ${SEEDS[@]}; do
- for i in ${ITERS}; do
- echo -e "${dc}\t${n}\t${m}\t${s}\t${i}"
- done
- done
- done
- done
- done
-} > grid1.tsv
diff --git a/benchmarking/outtable.rds b/benchmarking/outtable.rds
index c3d04d72..5f6976cc 100644
Binary files a/benchmarking/outtable.rds and b/benchmarking/outtable.rds differ
diff --git a/benchmarking/run_parallel2.sh b/benchmarking/run_parallel.sh
similarity index 72%
rename from benchmarking/run_parallel2.sh
rename to benchmarking/run_parallel.sh
index 2645fc6b..636b159b 100755
--- a/benchmarking/run_parallel2.sh
+++ b/benchmarking/run_parallel.sh
@@ -1,7 +1,7 @@
#!/bin/bash
set -euo pipefail
-mkdir -p bench2_results logs2 logs2/mem
+mkdir -p bench_results logs logs/mem
# Detect cores (Linux + macOS)
if command -v nproc >/dev/null 2>&1; then
@@ -34,12 +34,12 @@ parallel \
--shuf \
--bar \
--eta \
- --joblog logs2/joblog1.tsv \
- --results logs2/out \
+ --joblog logs/joblog1.tsv \
+ --results logs/out \
'
- OUT="bench2_results/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.rds"
- MEMRAW="logs2/mem/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.raw"
- MEMTSV="logs2/mem/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.tsv"
+ OUT="bench_results/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.rds"
+ MEMRAW="logs/mem/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.raw"
+ MEMTSV="logs/mem/{data_class}__{subjects}__{method}__{flag_method}__{seed}__{iter}.tsv"
# Idempotent: skip if result already exists and non-empty
test -s "$OUT" && exit 0
@@ -47,11 +47,11 @@ parallel \
# Run and capture peak RSS cross-platform
if /usr/bin/time -v true >/dev/null 2>&1; then
# Linux/GNU time: write verbose stats to MEMRAW
- /usr/bin/time -v -o "$MEMRAW" Rscript benchmark2.R "$(basename "$OUT")"
+ /usr/bin/time -v -o "$MEMRAW" Rscript benchmark.R "$(basename "$OUT")"
PEAK_KIB=$(awk -F: "/Maximum resident set size/ {gsub(/^[[:space:]]+|[[:space:]]+$/,\"\",\$2); print \$2}" "$MEMRAW")
else
# macOS/BSD time: -l prints "maximum resident set size" in bytes to stderr
- /usr/bin/time -l Rscript benchmark2.R "$(basename "$OUT")" 2> "$MEMRAW"
+ /usr/bin/time -l Rscript benchmark.R "$(basename "$OUT")" 2> "$MEMRAW"
# Convert bytes -> KiB (rounded)
PEAK_KIB=$(awk "/maximum resident set size/ {kb=int(\$1/1024+0.5); print kb}" "$MEMRAW")
fi
@@ -60,11 +60,11 @@ parallel \
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
"{data_class}" "{subjects}" "{method}" "{flag_method}" "{seed}" "{iter}" "$OUT" "$PEAK_KIB" >> "$MEMTSV"
' \
- :::: grid2.tsv
+ :::: grid.tsv
# Aggregate per-job mem into one table (overwrite each run)
{
echo -e "data_class\tsubjects\tmethod\tflag_method\tseed\titer\tout\tmax_rss_kib"
- awk 'FNR==1 && NR!=1 { next } { print }' logs2/mem/*.tsv 2>/dev/null || true
-} > logs2/peak_mem.tsv
+ awk 'FNR==1 && NR!=1 { next } { print }' logs/mem/*.tsv 2>/dev/null || true
+} > logs/peak_mem.tsv
diff --git a/benchmarking/run_parallel1.sh b/benchmarking/run_parallel1.sh
deleted file mode 100755
index c689b822..00000000
--- a/benchmarking/run_parallel1.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-mkdir -p bench1_results logs1 bench2_results logs2
-
-# Detect cores (Linux + macOS)
-if command -v nproc >/dev/null 2>&1; then
- CORES=$(nproc)
-else
- CORES=$(sysctl -n hw.ncpu)
-fi
-
-# Default concurrency: on dragontail 80% of machine cores (>=1)
-JOBS=$(( 4 * CORES / 5 ))
-if [ "$JOBS" -lt 1 ]; then JOBS=1; fi
-
-# Memory safeguard: require 2 GiB free before starting another job.
-# Tweak this per box; you can also use 4G/8G on small/large machines.
-MEMFREE="128G"
-
-# Export per-job env to force single-threaded math/libs:
-export OPENBLAS_NUM_THREADS=1
-export OMP_NUM_THREADS=1
-export MKL_NUM_THREADS=1
-export VECLIB_MAXIMUM_THREADS=1
-export GOTO_NUM_THREADS=1
-export R_DATATABLE_NUM_THREADS=1
-
-# Run!
-parallel \
- --colsep '\t' \
- --header : \
- --jobs "$JOBS" \
- --memfree "$MEMFREE" \
- --shuf \
- --bar \
- --eta \
- --joblog logs1/joblog1.tsv \
- --results logs1/out \
- '
- OUT="bench1_results/{data_class}__{subjects}__{method}__{seed}__{iter}.rds"
- # Idempotent: skip if file exists and is non-empty
- test -s "$OUT" && exit 0
-
- Rscript benchmark1.R "$(basename "$OUT")"
- ' \
- :::: grid1.tsv
diff --git a/man-roxygen/details-header-and-assignable-codes.R b/man-roxygen/details-header-and-assignable-codes.R
index 22d8f5a8..3bdabbbf 100644
--- a/man-roxygen/details-header-and-assignable-codes.R
+++ b/man-roxygen/details-header-and-assignable-codes.R
@@ -10,4 +10,5 @@
#' Code 055.7 is a header because 055.71 and 055.72 exist.
#'
#' Some codes change status across years. For example, ICD-9-CM 516.3 was
-#' assignable in fiscal years 2006–2011, then became a header in 2012–2015.
+#' assignable in fiscal years 1997–2011 for the CDC extracts (2006–2011 for CMS)
+#' and became a header in 2012–2015.
diff --git a/man-roxygen/params-icd-year.R b/man-roxygen/params-icd-year.R
index 7efd8535..4055f1b8 100644
--- a/man-roxygen/params-icd-year.R
+++ b/man-roxygen/params-icd-year.R
@@ -1,4 +1,5 @@
#' @param year Numeric scalar. Calendar or fiscal year to reference. Default
-#' is the most current year available per source. For ICD-9, the latest year
-#' is 2015; ICD-10 source are updated annually. Calendar year for WHO and CDC
-#' mortality. Fiscal year for CMS.
+#' is the most current year available per source. For ICD-9, CMS data run
+#' through fiscal year 2015 and CDC extracts through 2012; ICD-10 sources are
+#' updated annually. Calendar year for WHO and CDC mortality. Fiscal year for
+#' CMS.
diff --git a/man/get_icd_codes.Rd b/man/get_icd_codes.Rd
index 8d0bf6df..1d7db691 100644
--- a/man/get_icd_codes.Rd
+++ b/man/get_icd_codes.Rd
@@ -79,14 +79,16 @@ calendar years.
\code{known_start} is the first fiscal or calendar year (depending on source) that
the medicalcoder package as definitive source data for. ICD-9-CM started in
-the United States in fiscal year 1980. Source information that could be
-downloaded from the CDC and CMS and added to the source code for the
-medicalcoder package goes back to 1997. As such 1997 is the "known start"
+the United States in fiscal year 1980. The CDC extracts included in
+medicalcoder span fiscal years 1997--2012; the CMS ICD-9-CM/PCS extracts
+start in fiscal year 2006 and run through fiscal year 2015. As such 1997 is
+the earliest "known start" for ICD-9 within medicalcoder.
\code{known_end} is the last fiscal or calendar year (depending on source)
-for which we have definitive source data for. For ICD-9-CM and ICD-9-PCS
-that is 2015. For ICD-10-CM and ICD-10-PCS, which are active, it is just the
-last year of known data. ICD-10 from the WHO ends in 2019.
+for which we have definitive source data for. For ICD-9-CM and ICD-9-PCS,
+CMS provides data through fiscal year 2015, while the CDC extracts stop at
+fiscal year 2012. For ICD-10-CM and ICD-10-PCS, which are active, it is just
+the last year of known data. ICD-10 from the WHO ends in 2019.
}
\subsection{Header and Assignable Codes}{
diff --git a/man/is_icd.Rd b/man/is_icd.Rd
index fd2987c1..62a8e8d9 100644
--- a/man/is_icd.Rd
+++ b/man/is_icd.Rd
@@ -31,9 +31,10 @@ diagnostic (ICD-9-CM, ICD-10-CM, CDC mortality, WHO), \code{0L} for procedural
\code{"cdc"}. Defaults to all.}
\item{year}{Numeric scalar. Calendar or fiscal year to reference. Default
-is the most current year available per source. For ICD-9, the latest year
-is 2015; ICD-10 source are updated annually. Calendar year for WHO and CDC
-mortality. Fiscal year for CMS.}
+is the most current year available per source. For ICD-9, CMS data run
+through fiscal year 2015 and CDC extracts through 2012; ICD-10 sources are
+updated annually. Calendar year for WHO and CDC mortality. Fiscal year for
+CMS.}
\item{headerok}{Logical scalar. If \code{FALSE} (default), only assignable
codes are considered valid; if \code{TRUE}, header codes are also accepted.}
@@ -75,7 +76,8 @@ Similarly for ICD-9-CM: "055" is a header for measles; 055.0, 055.1,
Code 055.7 is a header because 055.71 and 055.72 exist.
Some codes change status across years. For example, ICD-9-CM 516.3 was
-assignable in fiscal years 2006–2011, then became a header in 2012–2015.
+assignable in fiscal years 1997–2011 for the CDC extracts (2006–2011 for CMS)
+and became a header in 2012–2015.
}
\examples{
################################################################################
diff --git a/man/lookup_icd_codes.Rd b/man/lookup_icd_codes.Rd
index 50acc26c..0f490682 100644
--- a/man/lookup_icd_codes.Rd
+++ b/man/lookup_icd_codes.Rd
@@ -61,7 +61,8 @@ Similarly for ICD-9-CM: "055" is a header for measles; 055.0, 055.1,
Code 055.7 is a header because 055.71 and 055.72 exist.
Some codes change status across years. For example, ICD-9-CM 516.3 was
-assignable in fiscal years 2006–2011, then became a header in 2012–2015.
+assignable in fiscal years 1997–2011 for the CDC extracts (2006–2011 for CMS)
+and became a header in 2012–2015.
}
\seealso{
\itemize{
diff --git a/man/mdcr.Rd b/man/mdcr.Rd
index b213245d..2b9e9482 100644
--- a/man/mdcr.Rd
+++ b/man/mdcr.Rd
@@ -5,9 +5,7 @@
\alias{mdcr}
\title{Synthetic Data}
\format{
-\code{mdcr} is a \code{data.frame} with 4 columns, one for a patient id and 41 for
-diagnostic codes and 41 possible procedure codes. Each row is for one
-patient id.
+\code{mdcr} is a \code{data.frame} with 4 columns, Each row is for one ICD id.
\itemize{
\item \code{patid}: patient identifier, integer values
\item \code{icdv}: ICD version; integer values, 9 or 10
diff --git a/man/mdcr_longitudinal.Rd b/man/mdcr_longitudinal.Rd
index 07dfcda0..65aa6f9a 100644
--- a/man/mdcr_longitudinal.Rd
+++ b/man/mdcr_longitudinal.Rd
@@ -5,9 +5,9 @@
\alias{mdcr_longitudinal}
\title{Synthetic Longitudinal Data}
\format{
-\code{mdcr_longitudinal} is a \code{data.frame} with four columns. The codes are
-expected to be treated as diagnostic codes but there are a few ICD-9 codes
-which could match to procedure codes as well.
+\code{mdcr_longitudinal} is a \code{data.frame} with 4 columns. The codes are
+expected to be treated as diagnostic codes. Warning: there are a few ICD-9
+codes which could match to procedure codes.
\itemize{
\item \code{patid}: patient identifier, integer values
\item \code{date}: date the diagnostic code was recorded
diff --git a/testing/Makefile b/testing/Makefile
index b7223d90..2ea8d5de 100644
--- a/testing/Makefile
+++ b/testing/Makefile
@@ -20,40 +20,6 @@ WITHOUTSUGGESTS := $(addsuffix -without-suggests,$(VERSIONS))
# Limit with-suggests to R versions whose base OS is still supported (roughly Debian >= bullseye/R 4.1+).
WITHSUGGESTS := $(addsuffix -with-suggests,$(V41) $(V42) $(V43) $(V44) $(V45) $(LATEST))
-# CRAN snapshots: use the date the *next* R release occurred. Empty means "current".
-SNAP_4.4.2 :=
-SNAP_4.4.1 := 2024-10-31
-SNAP_4.4.0 := 2024-06-14
-SNAP_4.3.3 := 2024-04-24
-SNAP_4.3.2 := 2024-02-29
-SNAP_4.3.1 := 2023-10-31
-SNAP_4.3.0 := 2023-06-16
-SNAP_4.2.3 := 2023-04-21
-SNAP_4.2.2 := 2023-03-15
-SNAP_4.2.1 := 2022-10-31
-SNAP_4.2.0 := 2022-06-23
-SNAP_4.1.3 := 2022-04-22
-SNAP_4.1.2 := 2022-03-10
-SNAP_4.1.1 := 2021-11-01
-SNAP_4.1.0 := 2021-08-10
-SNAP_4.0.5 := 2021-05-18
-SNAP_4.0.4 := 2021-03-31
-SNAP_4.0.3 := 2021-02-15
-SNAP_4.0.2 := 2020-10-10
-SNAP_4.0.1 := 2020-06-22
-SNAP_4.0.0 := 2020-06-06
-SNAP_3.6.3 := 2020-04-24
-SNAP_3.6.2 := 2020-02-29
-SNAP_3.6.1 := 2019-12-12
-SNAP_3.6.0 := 2019-07-05
-SNAP_3.5.3 := 2019-04-26
-SNAP_3.5.2 := 2019-03-11
-SNAP_3.5.1 := 2018-12-20
-SNAP_3.5.0 := 2018-07-02
-# 4.5.x snapshots default to "current" until next releases occur.
-
-snapshot_arg = $(if $(SNAP_$1),--build-arg SNAPSHOT=$(SNAP_$1),)
-
PKG_ROOT := ..
PKG_VERSION := $(shell awk '/^Version:/{print $$2}' $(PKG_ROOT)/DESCRIPTION)
PKG_NAME := $(shell awk '/^Package:/{print $$2}' $(PKG_ROOT)/DESCRIPTION)
@@ -74,12 +40,14 @@ $(TARBALL): $(PKG_ROOT)/$(TARBALL)
README.md: README.Rmd $(CHECKS)
$(R) -q -e "knitr::knit('$<', output = '$@')"
-R-%-without-suggests/$(PKG_NAME).Rcheck/00check.log: .images/R-%-without-suggests.stamp $(TARBALL)
+R-%-without-suggests/$(PKG_NAME).Rcheck/00check.log: $(TARBALL)
$(RM) -r R-$*-without-suggests
mkdir -p R-$*-without-suggests
$(DOCKER) run \
-v "$$(PWD)":/work \
- --rm mdcr-$*-without-suggests \
+ -w /work \
+ -e _R_CHECK_FORCE_SUGGESTS_=false \
+ --rm r-base:$* \
R CMD check --no-build-vignettes --ignore-vignettes --no-manual -o R-$*-without-suggests $(TARBALL) > R-$*-without-suggests/docker.log 2>&1
R-%-with-suggests/$(PKG_NAME).Rcheck/00check.log: .images/R-%-with-suggests.stamp $(TARBALL)
@@ -90,14 +58,9 @@ R-%-with-suggests/$(PKG_NAME).Rcheck/00check.log: .images/R-%-with-suggests.stam
--rm mdcr-$*-with-suggests \
R CMD check -o R-$*-with-suggests $(TARBALL) > R-$*-with-suggests/docker.log 2>&1
-.images/R-%-without-suggests.stamp: docker/without-suggests/Dockerfile $(TARBALL) | .images
- $(DOCKER) info >/dev/null 2>&1 || { echo "$(DOCKER) is unavailable or the daemon cannot be reached"; exit 1; }
- $(DOCKER) build --build-arg R_TAG=$* -t mdcr-$*-without-suggests -f $< .
- @touch $@
-
.images/R-%-with-suggests.stamp: docker/with-suggests/Dockerfile $(TARBALL) | .images
$(DOCKER) info >/dev/null 2>&1 || { echo "$(DOCKER) is unavailable or the daemon cannot be reached"; exit 1; }
- $(DOCKER) build --build-arg R_TAG=$* $(call snapshot_arg,$*) -t mdcr-$*-with-suggests -f $< .
+ $(DOCKER) build --build-arg R_TAG=$* -t mdcr-$*-with-suggests -f $< .
@touch $@
.images:
diff --git a/testing/README.Rmd b/testing/README.Rmd
index 99b16ed0..79eddd30 100644
--- a/testing/README.Rmd
+++ b/testing/README.Rmd
@@ -29,6 +29,11 @@ To run the tests you need
Just run `make` from this directory.
+**NOTE:** When something goes wrong and you need to dig into a specific image
+run from this directory.
+
+ docker run -v .:/work/ -it
+
# Last Testing Results
```{r}
diff --git a/testing/README.md b/testing/README.md
index 7d1e5b80..50564a8c 100644
--- a/testing/README.md
+++ b/testing/README.md
@@ -18,6 +18,11 @@ To run the tests you need
Just run `make` from this directory.
+**NOTE:** When something goes wrong and you need to dig into a specific image
+run from this directory.
+
+ docker run -v .:/work/ -it
+
# Last Testing Results
@@ -33,27 +38,111 @@ Just run `make` from this directory.
-
With Suggested Packages
+
With Suggested Packages
+
4.1.0
+
0
+
+
+
+
+
+
4.1.1
+
0
+
+
+
+
+
+
4.1.2
+
0
+
+
+
+
+
+
4.1.3
+
0
+
+
+
+
+
+
4.2.0
+
0
+
+
+
+
+
+
4.2.1
+
0
+
+
+
+
+
+
4.2.2
+
0
+
+
+
+
+
+
4.2.3
+
0
+
+
+
+
+
+
4.3.0
+
0
+
+
+
+
+
+
4.3.1
+
0
+
+
+
+
+
+
4.3.2
+
0
+
+
+
+
+
+
4.3.3
+
0
+
+
+
+
+
4.4.0
0
-
Warning 1
-
Note 4
+
+
4.4.1
0
-
Note 1
+
4.4.2
0
-
Note 1
+
4.5.0
@@ -82,196 +171,196 @@ Just run `make` from this directory.