diff --git a/DESCRIPTION b/DESCRIPTION index 8a89ee3..204a8a7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: rdhs Type: Package Title: API Client and Dataset Management for the Demographic and Health Survey (DHS) Data -Version: 0.8.2 +Version: 0.8.3 Authors@R: c(person(given = "OJ", family = "Watson", @@ -43,10 +43,10 @@ Imports: qdapRegex, getPass, haven, - iotools, sf, cli, - rlang + rlang, + vroom Suggests: testthat, knitr, diff --git a/NEWS.md b/NEWS.md index 703324e..2b9557a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,16 @@ # rdhs (development version) +## rdhs 0.8.3 + +* Internal change to `read_dhs_flat()` to reduce memory usage (`for` loop instead of `Map()`). + Reduces risk of `Error: vector memory exhausted` when parsing large dataset. +* Replace iotools::input.file() with vroom::vroom_fwf(). + ## rdhs 0.8.2 * Spatial boundaries will be cached using the DHS client (#122) + ## rdhs 0.8.1 * Convert DHS dataset flat file data dictionaries to UTF-8. This addresses parsing diff --git a/R/read_dhs_flat.R b/R/read_dhs_flat.R index fffb9b7..aeb99e4 100644 --- a/R/read_dhs_flat.R +++ b/R/read_dhs_flat.R @@ -335,25 +335,31 @@ read_dhs_flat <- function(zfile, all_lower=TRUE, meta_source=NULL) { stop("metadata file not found") } - types <- c("integer", "character", "numeric") + types <- c("i", "c", "n") dct$col_types <- types[match(dct$datatype, c("Numeric", "Alpha", "Decimal"))] dat <- read_zipdata( - zfile, "\\.DAT$", iotools::input.file, formatter = iotools::dstrfw, - col_types = dct$col_types, widths = dct$len, strict = FALSE - ) - names(dat) <- dct$name - dat[dct$name] <- Map("attr<-", dat[dct$name], "label", dct$label) + zfile, "\\.DAT$", vroom::vroom_fwf, + col_positions = vroom::fwf_widths(dct$len, col_names = dct$name), + col_types = paste0(dct$col_types, collapse = ""), + progress = FALSE, + .name_repair = "minimal" + ) + + for(idx in seq_along(dct$name)) { + attr(dat[dct$name[[idx]]], "label") <- dct$label[[idx]] + } + haslbl <- unlist(lapply(dct$labels, length)) > 0 # match on haven package version if (packageVersion("haven") > "1.1.2") { - dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]], - dct$labels[haslbl], - dct$label[haslbl]) + dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]], + dct$labels[haslbl], + dct$label[haslbl]) } else { dat[dct$name[haslbl]] <- Map(haven::labelled, dat[dct$name[haslbl]], dct$labels[haslbl]) } - + return(dat) } diff --git a/R/utils.R b/R/utils.R index b02fd20..dc0c962 100644 --- a/R/utils.R +++ b/R/utils.R @@ -28,7 +28,7 @@ rbind_list_base <- function(x) { #' unzip_special <- function(zipfile, files = NULL, overwrite = TRUE, junkpaths = FALSE, exdir = ".", unzip = "internal", - setTimes = FALSE){ + setTimes = FALSE) { if (max(unzip(zipfile, list = TRUE)$Length) > 4e9) { unzip_file <- Sys.which("unzip")