ropensci · jeffeaton · Apr 21, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: rdhs
 Type: Package
 Title: API Client and Dataset Management for the Demographic and Health Survey (DHS) Data
-Version: 0.8.2
+Version: 0.8.3
 Authors@R: 
     c(person(given = "OJ",
              family = "Watson",
@@ -43,10 +43,10 @@ Imports:
     qdapRegex,
     getPass,
     haven,
-    iotools,
     sf,
     cli, 
-    rlang
+    rlang,
+    vroom
 Suggests:
     testthat,
     knitr,

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,16 @@
 # rdhs (development version)
 
+## rdhs 0.8.3
+
+* Internal change to `read_dhs_flat()` to reduce memory usage (`for` loop instead of `Map()`). 
+  Reduces risk of `Error: vector memory exhausted` when parsing large dataset.
+* Replace iotools::input.file() with vroom::vroom_fwf().
+
 ## rdhs 0.8.2
 
 * Spatial boundaries will be cached using the DHS client  (#122)
 
+
 ## rdhs 0.8.1
 
 * Convert DHS dataset flat file data dictionaries to UTF-8. This addresses parsing 

diff --git a/R/read_dhs_flat.R b/R/read_dhs_flat.R
@@ -335,25 +335,31 @@ read_dhs_flat <- function(zfile, all_lower=TRUE, meta_source=NULL) {
     stop("metadata file not found")
   }
 
-  types <- c("integer", "character", "numeric")
+  types <- c("i", "c", "n")
   dct$col_types <- types[match(dct$datatype, c("Numeric", "Alpha", "Decimal"))]
   dat <- read_zipdata(
-    zfile, "\\.DAT$", iotools::input.file, formatter = iotools::dstrfw,
-    col_types = dct$col_types, widths = dct$len, strict = FALSE
-  )
-  names(dat) <- dct$name
-  dat[dct$name] <- Map("attr<-", dat[dct$name], "label", dct$label)
+    zfile, "\\.DAT$", vroom::vroom_fwf,
+    col_positions = vroom::fwf_widths(dct$len, col_names = dct$name),
+    col_types = paste0(dct$col_types, collapse = ""),
+    progress = FALSE,
+    .name_repair = "minimal"
+  )  
+
+  for(idx in seq_along(dct$name)) {
+    attr(dat[dct$name[[idx]]], "label") <- dct$label[[idx]]
+  }
+
   haslbl <- unlist(lapply(dct$labels, length)) > 0
 
   # match on haven package version
   if (packageVersion("haven") > "1.1.2") {
-  dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]],
-                               dct$labels[haslbl],
-                               dct$label[haslbl])
+    dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]],
+                                 dct$labels[haslbl],
+                                 dct$label[haslbl])
   } else {
     dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]],
                                  dct$labels[haslbl])
   }
-
+  
   return(dat)
 }
diff --git a/R/utils.R b/R/utils.R
@@ -28,7 +28,7 @@ rbind_list_base <- function(x) {
 #'
 unzip_special <- function(zipfile, files = NULL, overwrite = TRUE,
                           junkpaths = FALSE, exdir = ".", unzip = "internal",
-                          setTimes = FALSE){
+                          setTimes = FALSE) {
 
   if (max(unzip(zipfile, list = TRUE)$Length) > 4e9) {
     unzip_file <- Sys.which("unzip")