dewittpe · dewittpe · Dec 12, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/.gitignore b/.gitignore
@@ -11,7 +11,7 @@ data-raw/icd/desc_start_stop.rds
 data-raw/icd/icd10/.download_stamp
 data-raw/icd/icd10/icd10.rds
 data-raw/icd/icd9/.download_stamp
-data-raw/icd/icd9/icd9_cm_pcs.rds
+data-raw/icd/icd9/icd9.rds
 data-raw/icd/icd_chapters.rds
 data-raw/icd/icd_chapters_subchapters.rds
 data-raw/icd/icd_codes.rds

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,11 +21,11 @@ BugReports: https://github.com/dewittpe/medicalcoder/issues
 LazyData: true
 Suggests:
     data.table,
+    dplyr,
     kableExtra,
     knitr,
     R.utils,
-    rmarkdown,
-    tibble (>= 2.0.0)
+    rmarkdown
 RoxygenNote: 7.3.3
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,24 @@
+# medicalcoder 0.7.0.9000
+
+## New Features
+
+* If a `tibble` is passed to `comorbidities()` and the `dplyr` namespace is
+  available, then `dplyr` methods will be used for data manipulation.  This
+  change will generally result in less computation time than base R
+  `data.frames` (`data.tables` require even less time).
+
+## Other Changes
+
+* Extend and improve the internal ICD-9 database to distinguish between CDC and
+  CMS source.
+
+* Fix documentation of the `mdcr` and `mdcr_longitudinal` datasets.
+
+* Clarified internal data.frame/data.table helpers: documented that `mdcr_select()`
+  deep-copies data.table subsets to avoid aliasing, noted the selfref fix in
+  `mdcr_set()`, and added inline guidance in the longitudinal section of
+  `comorbidities()` to explain the first-occurrence logic.
+
 # medicalcoder 0.7.0
 
 ## Bug Fixes

diff --git a/R/charlson.R b/R/charlson.R
@@ -62,7 +62,7 @@
   cci      <- as.integer(as.vector(X %*% cci_wt))
 
   # build the return object
-  rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+  rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
   rtn <- mdcr_set(rtn, j = "num_cmrb",  value = num_cmrb)
   rtn <- mdcr_set(rtn, j = "cmrb_flag", value = cmrb_flag)
   rtn <- mdcr_set(rtn, j = "cci",      value = cci)

diff --git a/R/comorbidities.R b/R/comorbidities.R
@@ -360,13 +360,13 @@ comorbidities.data.frame <- function(data,
   # Determine the lookup table and the columns for the lookup table to keep
   lookup_to_keep <- c("condition")
   if (startsWith(method, "pccc")) {
-    lookup <- get_pccc_codes()
+    lookup <- get(x = "pccc_codes", envir = ..mdcr_data_env.., inherits = FALSE)
     lookup_to_keep <- c(lookup_to_keep, "subcondition", "transplant_flag", "tech_dep_flag")
   } else if (startsWith(method, "charlson")) {
-    lookup <- get_charlson_codes()
+    lookup <- get("charlson_codes", envir = ..mdcr_data_env.., inherits = FALSE)
     lookup_to_keep <- c(lookup_to_keep)
   } else if (startsWith(method, "elixhauser")) {
-    lookup <- get_elixhauser_codes()
+    lookup <- get("elixhauser_codes", envir = ..mdcr_data_env.., inherits = FALSE)
     lookup_to_keep <- c(lookup_to_keep, "poaexempt")
   }
 
@@ -385,25 +385,21 @@ comorbidities.data.frame <- function(data,
   ##############################################################################
   # inner join the data with the lookup table
   on_full <-
-    merge(
+    mdcr_inner_join(
       x = if (full.codes) {data} else {data[0, ]},
       y = lookup,
-      all = FALSE,
       by.x = by_x,
       by.y = c("full_code", by_y),
-      suffixes = c("", ".y"),
-      sort = FALSE
+      suffixes = c("", ".y")
     )
 
   on_comp <-
-    merge(
+    mdcr_inner_join(
       x = if (compact.codes) {data} else {data[0, ]},
       y = lookup,
-      all = FALSE,
       by.x = by_x,
       by.y = c("code", by_y),
-      suffixes = c("", ".y"),
-      sort = FALSE
+      suffixes = c("", ".y")
     )
 
   ##############################################################################
@@ -519,6 +515,7 @@ comorbidities.data.frame <- function(data,
       grps <- c(grps, "subcondition")
       byconditions <- c(byconditions, "subcondition")
     }
+    # identify first occurrence per id/condition then retain encounters on/after it
     tmp <- mdcr_select(cmrb, c(grps, encid))
     tmp <- mdcr_setorder(tmp, c(grps, encid))
     keep <- !mdcr_duplicated(tmp, by = grps)
@@ -527,12 +524,11 @@ comorbidities.data.frame <- function(data,
 
     # merge on the poa.var
     foc <-
-      merge(x = foc,
-            y = cmrb,
-            all = TRUE,
-            by.x = c(id.vars2, "first_occurrance", byconditions),
-            by.y = c(id.vars2, encid, byconditions),
-            sort = FALSE
+      mdcr_full_outer_join(
+        x = foc,
+        y = cmrb,
+        by.x = c(id.vars2, "first_occurrance", byconditions),
+        by.y = c(id.vars2, encid, byconditions)
       )
 
     if (startsWith(method, "pccc")) {
@@ -546,7 +542,7 @@ comorbidities.data.frame <- function(data,
     foc <-
       lapply(foc,
              function(y) {
-               rtn <- merge(x = iddf, y = y, all.x = TRUE, by = c(id.vars2), allow.cartesian = TRUE, sort = FALSE)
+               rtn <- mdcr_left_join(x = iddf, y = y, by = c(id.vars2))
                rtn <- mdcr_subset(rtn, i = !is.na(rtn[["condition"]]))
                i <- rtn[[encid]] >= rtn[["first_occurrance"]]
                mdcr_subset(rtn, i = i)
@@ -619,17 +615,6 @@ comorbidities.data.frame <- function(data,
 
   ##############################################################################
   # set attributes and return
-  if (requireNamespace("tibble", quietly = TRUE) && inherits(data, "tbl_df")) {
-    if (subconditions) {
-      ccc[["conditions"]] <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc[["conditions"]])
-      for (i in seq_len(length(ccc[["subconditions"]]))) {
-        ccc[["subconditions"]][[i]] <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc[["subconditions"]][[i]])
-      }
-    } else {
-      ccc <- getExportedValue(name = "as_tibble", ns = "tibble")(x = ccc)
-    }
-  }
-
   attr(ccc, "method") <- method
   attr(ccc, "id.vars") <- id.vars
   attr(ccc, "flag.method") <- flag.method

diff --git a/R/datasets.R b/R/datasets.R
@@ -1,9 +1,7 @@
 #' Synthetic Data
 #'
 #' @format
-#' `mdcr` is a `data.frame` with 4 columns, one for a patient id and 41 for
-#' diagnostic codes and 41 possible procedure codes.  Each row is for one
-#' patient id.
+#' `mdcr` is a `data.frame` with 4 columns, Each row is for one ICD id.
 #'
 #' * `patid`: patient identifier, integer values
 #' * `icdv`: ICD version; integer values, 9 or 10
@@ -17,15 +15,15 @@
 #' Synthetic Longitudinal Data
 #'
 #' @format
-#' `mdcr_longitudinal` is a `data.frame` with four columns.  The codes are
-#' expected to be treated as diagnostic codes but there are a few ICD-9 codes
-#' which could match to procedure codes as well.
+#' `mdcr_longitudinal` is a `data.frame` with 4 columns.  The codes are
+#' expected to be treated as diagnostic codes. Warning: there are a few ICD-9
+#' codes which could match to procedure codes.
 #'
 #' * `patid`: patient identifier, integer values
 #' * `date`: date the diagnostic code was recorded
 #' * `icdv`: ICD version 9 or 10, integer valued
 #' * `code`: ICD codes; character values
-#"
+#'
 #' @family datasets
 #'
 "mdcr_longitudinal"
diff --git a/R/elixhauser.R b/R/elixhauser.R
@@ -36,7 +36,7 @@
   }
 
   # build the return object
-  rtn <- cbind(iddf, as.data.frame(results$X, check.names = FALSE, stringsAsFactors = FALSE))
+  rtn <- mdcr_cbind(iddf, as.data.frame(results$X, check.names = FALSE, stringsAsFactors = FALSE))
   rtn <- mdcr_set(rtn, j = "num_cmrb", value = results$num_cmrb)
   rtn <- mdcr_set(rtn, j = "cmrb_flag", value = results$cmrb_flag)
   rtn <- mdcr_set(rtn, j = "mortality_index", value = results$mortality_index)
@@ -47,7 +47,6 @@
 }
 
 .elixhauser_post2022 <- function(ccc, id.vars, iddf, cmrb, poa.var, primarydx.var, method) {
-
   conditions <- ..mdcr_internal_elixhauser_codes..[["condition"]][which(..mdcr_internal_elixhauser_codes..[[method]] == 1L)]
   conditions <- sort(unique(conditions))
 
@@ -207,7 +206,6 @@
 }
 
 .elixhauser_pre2022 <- function(ccc, id.vars, iddf, cmrb, poa.var, primarydx.var, method) {
-
   # what are the relevent coniditions
   conditions <-
     unique(..mdcr_internal_elixhauser_codes..[["condition"]][which(..mdcr_internal_elixhauser_codes..[[method]] == 1L)])

diff --git a/R/get_icd_codes.R b/R/get_icd_codes.R
@@ -28,14 +28,16 @@
 #'
 #' `known_start` is the first fiscal or calendar year (depending on source) that
 #' the medicalcoder package as definitive source data for.  ICD-9-CM started in
-#' the United States in fiscal year 1980.  Source information that could be
-#' downloaded from the CDC and CMS and added to the source code for the
-#' medicalcoder package goes back to 1997.  As such 1997 is the "known start"
+#' the United States in fiscal year 1980.  The CDC extracts included in
+#' medicalcoder span fiscal years 1997--2012; the CMS ICD-9-CM/PCS extracts
+#' start in fiscal year 2006 and run through fiscal year 2015.  As such 1997 is
+#' the earliest "known start" for ICD-9 within medicalcoder.
 #'
 #' `known_end` is the last fiscal or calendar year (depending on source)
-#' for which we have definitive source data for.  For ICD-9-CM and ICD-9-PCS
-#' that is 2015.  For ICD-10-CM and ICD-10-PCS, which are active, it is just the
-#' last year of known data.  ICD-10 from the WHO ends in 2019.
+#' for which we have definitive source data for.  For ICD-9-CM and ICD-9-PCS,
+#' CMS provides data through fiscal year 2015, while the CDC extracts stop at
+#' fiscal year 2012.  For ICD-10-CM and ICD-10-PCS, which are active, it is just
+#' the last year of known data.  ICD-10 from the WHO ends in 2019.
 #'
 #' ## Header and Assignable Codes
 #'

diff --git a/R/is_icd.R b/R/is_icd.R
@@ -67,7 +67,7 @@ is_icd <- function(x, icdv = c(9L, 10L), dx = c(1L, 0L),
   }
 
   # get the known icd codes and filter to relevent codes
-  codes <- get_icd_codes(with.descriptions = FALSE, with.hierarchy = FALSE)
+  codes <- get("icd_codes", envir = ..mdcr_data_env.., inherits = FALSE)
 
   # keep based on icdv, dx, and src
   keep <- (codes[["icdv"]] %in% icdv) & (codes[["dx"]] %in% dx) & (codes[["src"]] %in% src)

diff --git a/R/lookup_icd_codes.R b/R/lookup_icd_codes.R
@@ -43,7 +43,7 @@ lookup_icd_codes <- function(x, regex = FALSE, full.codes = TRUE, compact.codes
   assert_scalar_logical(compact.codes)
   stopifnot(isTRUE(full.codes | compact.codes))
 
-  ICDCODES <- get_icd_codes(with.descriptions = FALSE, with.hierarchy = FALSE)
+  ICDCODES <- get("icd_codes", envir = ..mdcr_data_env.., inherits = FALSE)
 
   if (regex) {
     if(full.codes) {

diff --git a/R/pccc.R b/R/pccc.R
@@ -139,7 +139,7 @@
 
   X <- X[, colorder, drop = FALSE]
 
-  rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+  rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
 
   if (subconditions) {
     rtn <- list(conditions = rtn, subconditions = list())
@@ -166,7 +166,7 @@
         X[cbind(ri[keep], ci[keep])] <- 1L
       }
 
-      rtn[["subconditions"]][[cnd]] <- cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+      rtn[["subconditions"]][[cnd]] <- mdcr_cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
     }
   }
 
@@ -209,7 +209,7 @@
   X[match(key_tran, key_iddf), "any_transplant"] <- 1L
   X <- cbind(X, num_cmrb, cmrb_flag)
 
-  rtn <- cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+  rtn <- mdcr_cbind(iddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
 
   if (subconditions) {
     rtn <- list(conditions = rtn, subconditions = list())
@@ -235,7 +235,7 @@
         X[cbind(ri[keep], ci[keep])] <- 1L
       }
 
-      rtn[["subconditions"]][[cnd]] <- cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
+      rtn[["subconditions"]][[cnd]] <- mdcr_cbind(uiddf, as.data.frame(X, check.names = FALSE, stringsAsFactors = FALSE))
     }
   }
 

diff --git a/R/summary.R b/R/summary.R
@@ -104,7 +104,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
 
   N <- nrow(object[["conditions"]])
 
-  conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+  conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
   conditions <- unique(conditions)
   conditions <- conditions[order(conditions[["condition"]]), ]
 
@@ -159,7 +159,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
     warning(sprintf("Logic for pccc_summary_table has been implemented for flag.method = 'current'.  Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
   }
 
-  conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+  conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
   conditions <- unique(conditions)
   conditions <- conditions[order(conditions[["condition"]]), ]
 
@@ -189,7 +189,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
     warning(sprintf("Logic for pccc_summary_table has been implemented for flag.method = 'current'.  Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
   }
 
-  conditions <- get_pccc_conditions()[c("condition", "condition_label")]
+  conditions <- ..mdcr_internal_pccc_conditions..[c("condition", "condition_label")]
   conditions <- unique(conditions)
   conditions <- conditions[order(conditions[["condition"]]), ]
 
@@ -254,7 +254,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
     warning(sprintf("Logic for charlson summary table has been implemented for flag.method = 'current'.  Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
   }
 
-  cmrbs <- get_charlson_index_scores()[!is.na( get_charlson_index_scores()[[attr(object, "method")]]), c("condition_description", "condition")]
+  cmrbs <- ..mdcr_internal_charlson_index_scores..[!is.na( ..mdcr_internal_charlson_index_scores..[[attr(object, "method")]]), c("condition_description", "condition")]
 
   cmrbs[["count"]] <- colSums(object[cmrbs[["condition"]]])
   cmrbs[["percent"]] <- 100 * colMeans(object[cmrbs[["condition"]]])
@@ -305,7 +305,7 @@ summary.medicalcoder_comorbidities_with_subconditions <- function(object, ...) {
     warning(sprintf("Logic for Elixhauser summary has been implemented for flag.method = 'current'.  Using this function for flag.method = '%s' may not provide a meaningful summary.", attr(object, "flag.method")))
   }
 
-  cmrbs <- get_elixhauser_index_scores()[!is.na( get_elixhauser_index_scores()[[attr(object, "method")]]), "condition", drop = FALSE]
+  cmrbs <- ..mdcr_internal_elixhauser_index_scores..[!is.na( ..mdcr_internal_elixhauser_index_scores..[[attr(object, "method")]]), "condition", drop = FALSE]
   cmrbs <- unique(cmrbs)
 
   cmrbs[["count"]] <- colSums(object[cmrbs[["condition"]]])