dewittpe · dewittpe · Dec 17, 2025 · Dec 12, 2025 · Dec 12, 2025 · Dec 15, 2025
diff --git a/Makefile b/Makefile
@@ -61,7 +61,7 @@ data-raw:
 	  > $@ 2>&1
 
 # README depends on dev deps because it uses devtools::load_all()
-README.md: $(PKG_ROOT)/README.Rmd .install_dev_deps.Rout benchmarking/outtable.rds
+README.md: $(PKG_ROOT)/README.Rmd .install_dev_deps.Rout
 	$(RSCRIPT) -e "devtools::load_all('$(PKG_ROOT)')" \
 	  -e "knitr::knit('$(PKG_ROOT)/README.Rmd', output='README.md')"
 

diff --git a/NEWS.md b/NEWS.md
@@ -19,6 +19,8 @@
   `mdcr_set()`, and added inline guidance in the longitudinal section of
   `comorbidities()` to explain the first-occurrence logic.
 
+* Improve cumulative flagging to apply first-occurrence logic more effecently.
+
 # medicalcoder 0.7.0
 
 ## Bug Fixes

diff --git a/R/comorbidities.R b/R/comorbidities.R
@@ -272,11 +272,7 @@ comorbidities.data.frame <- function(data,
     primarydx.var <- primarydx <- NULL
   }
 
-  flag.method <-
-    match.arg(
-      flag.method,
-      several.ok = FALSE
-    )
+  flag.method <- match.arg(flag.method, choices = c("current", "cumulative"), several.ok = FALSE)
 
   if (startsWith(method, "charlson") && !is.null(age.var)) {
     is_a_column(age.var, names(data))
@@ -520,29 +516,35 @@ comorbidities.data.frame <- function(data,
     tmp <- mdcr_setorder(tmp, c(grps, encid))
     keep <- !mdcr_duplicated(tmp, by = grps)
     foc <- mdcr_subset(tmp, keep)
-    foc <- mdcr_setnames(foc, old = encid, new = "first_occurrance")
 
-    # merge on the poa.var
+    # add the first occurrence on to the cmrb data.frame
     foc <-
-      mdcr_full_outer_join(
-        x = foc,
-        y = cmrb,
-        by.x = c(id.vars2, "first_occurrance", byconditions),
-        by.y = c(id.vars2, encid, byconditions)
+      mdcr_left_join(
+        x = cmrb,
+        y = foc,
+        by = c(id.vars2, encid, byconditions)
       )
+    foc <- mdcr_setnames(foc, old = encid, new = "first_occurrance")
+
+    iddf2 <-
+      mdcr_inner_join(
+        x = unique(mdcr_select(iddf, id.vars)),
+        y = unique(mdcr_select(foc, id.vars2)),
+        by = id.vars2)
+    iddf2 <- unique(iddf2)
 
     if (startsWith(method, "pccc")) {
       foc <- split(foc, f = mdcr_select(foc, c("condition", "subcondition")), drop = TRUE)
     } else {
       foc <- split(foc, f = mdcr_select(foc, c("condition")), drop = TRUE)
     }
-
     foc <- lapply(foc, unique)
 
+
     foc <-
       lapply(foc,
              function(y) {
-               rtn <- mdcr_left_join(x = iddf, y = y, by = c(id.vars2))
+               rtn <- mdcr_left_join(x = iddf2, y = y, by = c(id.vars2))
                rtn <- mdcr_subset(rtn, i = !is.na(rtn[["condition"]]))
                i <- rtn[[encid]] >= rtn[["first_occurrance"]]
                mdcr_subset(rtn, i = i)

diff --git a/benchmarking/.gitignore b/benchmarking/.gitignore
@@ -1,9 +1,6 @@
+*.rds
 *.tsv
-.bench1
-.bench2
+.benchmark
 Rplots.pdf
-bench1_results/
-bench2
-bench2_results/
-logs1/
-logs2/
+bench_results/
+logs/
diff --git a/benchmarking/Makefile b/benchmarking/Makefile
@@ -1,30 +1,9 @@
 include ../Makevars
 
-.NOPARALLEL: .bench
-
-.PHONY: all clean
-
 all: README.md
 
-grid.tsv: make_grid.sh
-	./$<
-
-.bench: run_parallel.sh grid.tsv
-	./$<
-	@touch $@
-
-outtable.rds: benchmark-summary.R .bench
+benchmark-composite.svg: benchmarks.R
 	$(RSCRIPT) $<
 
-README.md: README.Rmd outtable.rds
+README.md: README.Rmd benchmark-composite.svg
 	$(R) -e "knitr::knit('$<', output = '$@')"
-
-clean:
-	$(RM) .bench*
-	$(RM) *.svg
-	$(RM) *.pdf
-	$(RM) grid.tsv
-	$(RM) output.rds
-	$(RM) -r logs
-	$(RM) -r bench_results
-
diff --git a/benchmarking/README.Rmd b/benchmarking/README.Rmd
@@ -6,6 +6,7 @@ knitr::opts_chunk$set(
   comment = "#>",
   fig.path = "man/figures/README-"
 )
+library(knitr)
 library(kableExtra)
 suppressPackageStartupMessages(library(data.table))
 ```
@@ -15,7 +16,7 @@ suppressPackageStartupMessages(library(data.table))
 The major factors impacting the expected computation time for applying a
 comorbidity algorithm to a data set are:
 
-1. Data size: number of subjects/encounters.
+1. Data size: number of encounters.
 2. Data storage class: `medicalcoder` has been built such that no imports of
    other namespaces is required. That said, when a `data.table` is passed to
    `comorbidities()` and the `data.table` namespace is available, then S3
@@ -25,77 +26,51 @@ comorbidity algorithm to a data set are:
    base `data.frame`, though `data.table` remains the fastest option.
 3. flag.method: "current" will take less time than the "cumulative" method.
 
-<img src="benchmark2-composite.svg"/>
+The following summary is based on resampling the [MIMIC-IV Clinical Database
+Demo](https://physionet.org/content/mimic-iv-demo/2.2/) data set provided by
+PhysioNet.^1,2^ Using the MIMIC-IV Clinical Database Demo set will allow others to benchmark
+`medicalcoder` on their system with the same data as used here.
 
-```{r outtable, include = FALSE}
-outtable <- readRDS("outtable.rds")
-setDT(outtable)
-outtable_orig <- copy(outtable)
-outtable[, memory := memory / (1024^2)] # From KiB to GiB
-
-outtable <-
-  dcast(
-    data = outtable,
-    data_class + encounters + method + subconditions ~ flag.method,
-    value.var = list("time_seconds", "relative_time", "memory")
-  )
+[^1^] Johnson, A., Bulgarelli, L., Pollard, T., Horng, S., Celi, L. A., & Mark, R. (2023). MIMIC-IV Clinical Database Demo (version 2.2). PhysioNet. RRID:SCR_007345. https://doi.org/10.13026/dp1f-ex47
 
-neworder <-
-  c("method",
-    "subconditions",
-    "encounters",
-    "data_class",
-    "time_seconds_current",
-    "relative_time_current",
-    "memory_current",
-    "time_seconds_cumulative",
-    "relative_time_cumulative",
-    "memory_cumulative"
-  )
+[^2^] Goldberger, A., Amaral, L., Glass, L., Hausdorff, J., Ivanov, P. C., Mark, R., ... & Stanley, H. E. (2000). PhysioBank, PhysioToolkit, and PhysioNet: Components of a new research resource for complex physiologic signals. Circulation [Online]. 101 (23), pp. e215–e220. RRID:SCR_007345.
 
-setcolorder(outtable, neworder = neworder)
-outtable[, data_class := factor(data_class, levels = c("data.frame", "data.table", "tibble"))]
-setkey(outtable, method, subconditions, encounters, data_class)
+```{r mimic-iv-import, include = FALSE}
+mimicivdata <- readRDS(file.path("mimic-iv", "mimicivdata.rds"))
+data.table::setDT(mimicivdata)
+```
 
-outtable[, encounters := qwraps2::frmt(as.integer(encounters))]
+The MIMIC-IV demo data consists of
+`r data.table::uniqueN(mimicivdata, by = c("subject_id"))` subjects and a total of
+`r data.table::uniqueN(mimicivdata, by = c("subject_id", "hadm_id"))`
+encounters.  A mix of ICD-9 and ICD-10 diagnostic and procedure codes are in the
+data.  The subjects were resampled to generate data sets with upto 1,000,000
+subjects.  Each generated data set was used to benchmark by
+
+* `method`:
+    * charlson_quan2005
+    * elixhauser_quan2005
+    * pccc_v3.1 without subconditions
+    * pccc_v3.1 with subconditions
+* data class:
+    * data.frame
+    * tibble
+    * data.table
+* `flag.method`
+    * current
+    * cumulative
+
+<img src="benchmark-composite.svg"/>
 
-outtable[, method := fifelse(subconditions, paste(method, "(with subconditions)"), method)]
-outtable[, subconditions := NULL]
+```{r outtable, include = FALSE}
+mimicivbenchmark <-readRDS(file.path("mimic-iv", "benchmark.rds"))
+data.table::setDT(mimicivbenchmark)
 ```
 
 In general, the expected time to apply a comorbidity method is lower for
 `tibble`s than for base `data.frame`s, and lower still for `data.table`s. Best
 observed case: a `data.table` took
-`r outtable_orig[data_class == "data.table", min(relative_time)]`
-the time of a `data.frame`; `tibble`s sit between the two.
-
-```{r show-outtable, echo = FALSE, results = "asis"}
-foo <- function(x) {
-  kbl(
-    x = x,
-    format = "html",
-    escape = FALSE,
-    digits = 2,
-    align = "ccrrrrrr",
-    col.names = c("Encounters", "Data Class", rep(c("Time (seconds)", "Relative time", "Memory (GB)"), times = 2)),
-    caption = "Expected time (seconds), relative time (with respect to data.frame), and expected memory use, by flagging method (current or cumulative), number of encounters, and input data storage format."
-  ) |>
-  collapse_rows(columns = 1, valign = "middle") |>
-  row_spec(row = c(seq(1, nrow(x), by = 6)), background = "#D9D9D9") |>
-  row_spec(row = c(seq(2, nrow(x), by = 6)), background = "#D9D9D9") |>
-  row_spec(row = c(seq(3, nrow(x), by = 6)), background = "#D9D9D9") |>
-  add_header_above(c("", "", "flag.method = 'current'" = 3, "flag.method = 'cumulative'" = 3))
-}
-
-cat("\n### Benchmarking Charlson (Quan 2005)\n\n")
-foo(x = outtable[method == "charlson_quan2005", -"method"])
-
-cat("\n### Benchmarking Elixhauser (Quan 2005)\n\n")
-foo(x = outtable[method == "elixhauser_quan2005", -"method"])
-
-cat("\n### Benchmarking PCCC v3.1 (without subconditions)\n\n")
-foo(x = outtable[method == "pccc_v3.1", -"method"])
-
-cat("\n### Benchmarking PCCC v3.1 (with subconditions)\n\n")
-foo(x = outtable[method == "pccc_v3.1 (with subconditions)", -"method"])
-```
+`r mimicivbenchmark[data_class == "data.table", min(relative_time)]`
+the time of a `data.frame`. Best case for `tibble`s was
+`r mimicivbenchmark[data_class == "tibble", min(relative_time)]`
+the time of a `data.frame`.