diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index c8841c356..e7e5fa890 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -14,6 +14,11 @@ Please: (backwards-incompatible changes to the documented interface) are noted. Collect the changes under the next release number (e.g. if you are on 1.7.2, then write your changes under the 1.8 heading). +- [ ] Styling and documentation checks. Make a PR comment with: + - `/style` to check the style and fix any issues. + - `/document` to check the package documentation and fix any issues. + - `/preview-docs` to preview the docs. + - See Actions GitHub tab to track progress of these commands. - See [DEVELOPMENT.md](DEVELOPMENT.md) for more information on the development process. diff --git a/.github/workflows/doc-preview.yaml b/.github/workflows/doc-preview.yaml deleted file mode 100644 index 068184225..000000000 --- a/.github/workflows/doc-preview.yaml +++ /dev/null @@ -1,65 +0,0 @@ -on: - issue_comment: - types: [created] - -name: doc-preview.yaml - -permissions: read-all - -jobs: - preview: - if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/preview-docs') }} - - runs-on: ubuntu-latest - permissions: - # Needed to write a comment on the PR - pull-requests: write - # Needed to read the PR branch - contents: read - steps: - - uses: actions/checkout@v4 - with: - # Checkout the PR branch - ref: refs/pull/${{ github.event.issue.number }}/head - - - uses: r-lib/actions/setup-pandoc@v2 - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::pkgdown, local::. - needs: website - - - name: Build site - run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) - shell: Rscript {0} - - - name: Deploy to Netlify - uses: nwtgck/actions-netlify@v3.0 - with: - # Standard config - github-token: ${{ secrets.GITHUB_TOKEN }} - deploy-message: "Deploy from GitHub Actions" - # 'docs/' is the default directory for pkgdown::build_site() - # we add 'dev' because _pkgdown.yml has 'development: mode: devel' - publish-dir: './docs/dev' - # Development deploys only - production-deploy: false - # Enable pull request comment (default) - enable-pull-request-comment: true - # Overwrite the pull request comment with updated link (default) - overwrites-pull-request-comment: true - # Don't deploy to GitHub - enable-github-deployment: false - # Don't update the status of the commit - enable-commit-status: false - # Don't comment on the commit - enable-commit-comment: false - env: - # Netlify credentials (currently from Dmitry's account) - NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} - NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} - timeout-minutes: 1 diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml deleted file mode 100644 index bfc3c43da..000000000 --- a/.github/workflows/document.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help -# -# Modifications: -# - devtools::build_readme() -# - workflow_dispatch added to allow manual triggering of the workflow -# - trigger branches changed -# - API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY -on: - push: - paths: ["R/**", "README.Rmd"] - workflow_dispatch: - -name: Document - -jobs: - document: - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} - steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - name: Install dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: | - any::devtools - any::roxygen2 - needs: | - devtools - roxygen2 - - - name: Document - run: roxygen2::roxygenise() - shell: Rscript {0} - - - name: Build README.md from README.Rmd - run: Rscript -e 'if (file.exists("README.Rmd")) devtools::build_readme()' - - - name: Commit and push changes - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add README.md - git add man/\* NAMESPACE DESCRIPTION - git commit -m "docs: document (GHA)" || echo "No changes to commit" - git pull --rebase - git push origin diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 21d030c94..78eae465f 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -1,10 +1,5 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -# -# Modifications: -# * workflow_dispatch added to allow manual triggering of the workflow -# * trigger branches changed -# * API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY on: push: branches: [main, dev] @@ -12,16 +7,17 @@ on: branches: [main, dev] workflow_dispatch: -name: Lint +name: lint.yaml + +permissions: read-all jobs: lint: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml new file mode 100644 index 000000000..6cf91e878 --- /dev/null +++ b/.github/workflows/pr-commands.yaml @@ -0,0 +1,149 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help +# Modifications: +# - Allow more roles to trigger each PR command +# - Document builds README.md from README.Rmd with devtools::build_readme() +# - Include a doc-preview command (uses Netlify to preview the docs) +on: + issue_comment: + types: [created] + +name: pr-commands.yaml + +permissions: read-all + +jobs: + document: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} + name: document + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/pr-fetch@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::roxygen2 + needs: pr-document + + - name: Document + run: roxygen2::roxygenise() + shell: Rscript {0} + + - name: Build README.md from README.Rmd + run: Rscript -e 'if (file.exists("README.Rmd")) devtools::build_readme()' + + - name: commit + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add man/\* NAMESPACE + git commit -m 'Document' + + - uses: r-lib/actions/pr-push@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + style: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} + name: style + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/pr-fetch@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - uses: r-lib/actions/setup-r@v2 + + - name: Install dependencies + run: install.packages("styler") + shell: Rscript {0} + + - name: Style + run: styler::style_pkg() + shell: Rscript {0} + + - name: commit + run: | + git config --local user.name "$GITHUB_ACTOR" + git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add \*.R + git commit -m 'Style' + + - uses: r-lib/actions/pr-push@v2 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + preview: + if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'CONTRIBUTOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/preview-docs') }} + + runs-on: ubuntu-latest + permissions: + # Needed to write a comment on the PR + pull-requests: write + # Needed to read the PR branch + contents: read + steps: + - uses: actions/checkout@v4 + with: + # Checkout the PR branch + ref: refs/pull/${{ github.event.issue.number }}/head + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. 
+ needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to Netlify + uses: nwtgck/actions-netlify@v3.0 + with: + # Standard config + github-token: ${{ secrets.GITHUB_TOKEN }} + deploy-message: "Deploy from GitHub Actions" + # 'docs/' is the default directory for pkgdown::build_site() + # we add 'dev' because _pkgdown.yml has 'development: mode: devel' + publish-dir: './docs/dev' + # Development deploys only + production-deploy: false + # Enable pull request comment (default) + enable-pull-request-comment: true + # Overwrite the pull request comment with updated link (default) + overwrites-pull-request-comment: true + # Don't deploy to GitHub + enable-github-deployment: false + # Don't update the status of the commit + enable-commit-status: false + # Don't comment on the commit + enable-commit-comment: false + env: + # Netlify credentials (currently from Dmitry's account) + NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} + NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} + timeout-minutes: 1 diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml deleted file mode 100644 index 396200a5f..000000000 --- a/.github/workflows/style.yml +++ /dev/null @@ -1,87 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -# -# Modifications: -# * workflow_dispatch added to allow manual triggering of the workflow -# * trigger branches changed -# * API key secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY -on: - push: - paths: - [ - "**.[rR]", - "**.[qrR]md", - "**.[rR]markdown", - "**.[rR]nw", - "**.[rR]profile", - ] - workflow_dispatch: - -name: Style - -jobs: - style: - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - DELPHI_EPIDATA_KEY: ${{ secrets.DELPHI_GITHUB_ACTIONS_EPIDATA_API_KEY }} - steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Setup R - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - name: Install dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: any::styler, any::roxygen2 - needs: styler - - - name: Enable styler cache - run: styler::cache_activate() - shell: Rscript {0} - - - name: Determine cache location - id: styler-location - run: | - cat( - "location=", - styler::cache_info(format = "tabular")$location, - "\n", - file = Sys.getenv("GITHUB_OUTPUT"), - append = TRUE, - sep = "" - ) - shell: Rscript {0} - - - name: Cache styler - uses: actions/cache@v3 - with: - path: ${{ steps.styler-location.outputs.location }} - key: ${{ runner.os }}-styler-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-styler- - ${{ runner.os }}- - - - name: Style - run: styler::style_pkg() - shell: Rscript {0} - - - name: Commit and push changes - run: | - if FILES_TO_COMMIT=($(git diff-index --name-only ${{ github.sha }} \ - | egrep --ignore-case '\.(R|[qR]md|Rmarkdown|Rnw|Rprofile)$')) - then - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git commit ${FILES_TO_COMMIT[*]} -m "style: styler (GHA)" - git pull --rebase - git push origin - else - echo "No changes to commit." 
- fi diff --git a/DESCRIPTION b/DESCRIPTION index 57981a195..decdb2802 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.0 +Version: 0.12.0 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), @@ -35,7 +35,8 @@ Description: This package introduces common data structures for working package is designed to be used in conjunction with `epipredict` for building and evaluating epidemiological models. License: MIT + file LICENSE -URL: https://cmu-delphi.github.io/epiprocess/ +URL: https://github.com/cmu-delphi/epiprocess, + https://cmu-delphi.github.io/epiprocess/ Depends: epidatasets, R (>= 3.6) @@ -46,9 +47,10 @@ Imports: dplyr (>= 1.1.0), ggplot2, glue, - lifecycle (>= 1.0.1), + lifecycle, lubridate, magrittr, + methods, pkgconfig, purrr, rlang, @@ -66,12 +68,13 @@ Suggests: distributional, epidatr, epipredict, + hardhat, here, knitr, outbreaks, readr, rmarkdown, - testthat (>= 3.1.5), + testthat, trendfilter, withr VignetteBuilder: @@ -102,6 +105,7 @@ Collate: 'methods-epi_archive.R' 'grouped_epi_archive.R' 'growth_rate.R' + 'inline-roxygen.R' 'key_colnames.R' 'methods-epi_df.R' 'outliers.R' diff --git a/NAMESPACE b/NAMESPACE index 2f97e5b3d..a77fca828 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ S3method(as_epi_df,tbl_df) S3method(as_epi_df,tbl_ts) S3method(as_tibble,epi_df) S3method(as_tsibble,epi_df) +S3method(autoplot,epi_archive) S3method(autoplot,epi_df) S3method(clone,epi_archive) S3method(clone,grouped_epi_archive) @@ -31,6 +32,7 @@ S3method(epix_slide,epi_archive) S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(filter,epi_archive) S3method(group_by,epi_archive) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) @@ -46,10 +48,13 @@ S3method(key_colnames,epi_archive) S3method(key_colnames,epi_df) S3method(key_colnames,tbl_ts) S3method(mean,epi_df) +S3method(plot,epi_archive) +S3method(plot,epi_df) S3method(print,epi_archive) S3method(print,epi_df) S3method(print,grouped_epi_archive) S3method(print,growth_rate_params) +S3method(print,revision_analysis) S3method(summary,epi_df) S3method(ungroup,epi_df) S3method(ungroup,grouped_epi_archive) @@ -73,6 +78,7 @@ export(epi_slide_mean) export(epi_slide_opt) export(epi_slide_sum) export(epix_as_of) +export(epix_as_of_current) export(epix_fill_through_version) export(epix_merge) export(epix_slide) @@ -86,6 +92,7 @@ export(group_modify) export(growth_rate) export(growth_rate_params) export(guess_period) +export(is_epi_archive) export(is_epi_df) export(is_grouped_epi_archive) export(key_colnames) @@ -94,7 +101,9 @@ export(new_epi_archive) export(new_epi_df) export(relocate) export(rename) +export(revision_analysis) export(revision_summary) +export(set_versions_end) export(slice) export(sum_groups_epi_df) export(time_column_names) diff --git a/NEWS.md b/NEWS.md index 3ac814aa2..ba277bb49 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,8 +2,28 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's. +# epiprocess 0.12 + +## New features + +- `is_epi_archive` function has been reintroduced. +- `epix_as_of_current()` introduced as an alias for `epix_as_of(.$versions_end)`. +- Added `dplyr::filter` implementation for `epi_archive`s. 
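A minimal sketch of the three additions above, assuming the `archive_cases_dv_subset` example archive from `{epidatasets}` is attached (as in this package's other examples):

```r
library(epiprocess)
library(dplyr)

# Reintroduced class test:
is_epi_archive(archive_cases_dv_subset) # TRUE

# Snapshot of the last known version; shorthand for epix_as_of(x, x$versions_end):
latest_snapshot <- epix_as_of_current(archive_cases_dv_subset)

# dplyr::filter() now works directly on archives:
ca_history <- archive_cases_dv_subset %>% filter(geo_value == "ca")
```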
+ +## Improvements + +- Various documentation has been updated, simplified, and improved with better + examples. + # epiprocess 0.11 +## Highlights + +`{epiprocess}` should once again install without requiring Rtools or a +compiler! We've also updated some function interfaces to be more consistent +throughout the package and with the tidyverse, and improved the generality of, +and fixed bugs in, various functions and documentation. + ## Breaking changes - `growth_rate()` argument order and names have changed. You will need to @@ -15,7 +35,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat for users without a compiler, we have placed `{trendfilter}` in Suggests:; if you want to use `method = "trendfilter"` you will need to manually install this dependency (e.g., with `remotes::install_github("glmgen/trendfilter")`). -- In `revision_summary()`: +- In `revision_summary()` (aliased to `revision_analysis()`): - The `should_compactify` argument is now called `compactify`. To migrate, change any calls with `should_compactify =` to `compactify =`. - Output now uses the name `lag_near_latest` instead of `time_near_latest`. To @@ -29,6 +49,11 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `min_waiting_period` now defines a nonstrict inequality instead of a strict one. To obtain the old bounds, bump the `min_waiting_period` up to the next possible value for your `time_type`. + - Added a `print()` method to take the place of `print_inform`. + - Removed the argument `print_inform`; informative printing is now handled by + the `print()` method. Added an option, `return_only_tibble`, to return only + the tibble of results rather than the full S3 object. + - In `key_colnames()`: - On regular (non-`epi_df`) data frames, now requires manual specification of `geo_keys`, `other_keys`, and `time_keys`. @@ -42,12 +67,16 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat `new_epi_archive()`. - `validate_epi_archive()` now follows the validator convention of operating on an "unvalidated" `epi_archive` (from `new_epi_archive`) rather than arguments. +- `autoplot.epi_df()` argument `.max_facets` has been deprecated in favor of + `.facet_filter`, which allows explicit selection of the facets to show. ## Improvements -- `revision_summary()` now supports all `time_type`s. +- `revision_summary()` now supports all `time_type`s. The printed summary is also more attractive. - The compactification tolerance setting now works with integer-type columns. - Various functions are now faster, using faster variants of core operations and avoiding reconstructing grouped `epi_df`s when unnecessary. +- Added `autoplot.epi_archive()` to display revision patterns. +- `sum_groups_epi_df()` now supports tidyselect syntax in its second argument (#655). ## Bug fixes diff --git a/R/archive.R b/R/archive.R index 922f77835..44058f8b2 100644 --- a/R/archive.R +++ b/R/archive.R @@ -52,13 +52,6 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, class = "epiprocess__version_bound_mismatched_class" ) } - if (!identical(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same `typeof` as x$version, - which has a `typeof` of {typeof(x$version)}", - class = "epiprocess__version_bound_mismatched_typeof" - ) - } } return(invisible(NULL)) @@ -207,38 +200,35 @@ next_after.Date <- function(x) x + 1L #' undergo tiny nonmeaningful revisions and the archive object with the #' default setting is too large.
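To ground the version-bound arguments documented just below, here is a toy sketch of supplying them to `as_epi_archive()`; the data and values are hypothetical, not from the package:

```r
library(epiprocess)

update_tbl <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-01-01") + 0:2,
  version = as.Date("2020-01-02") + 0:2,
  cases = 1:3
)

# Treat data published at the latest observed version as still subject to
# being overwritten, and record that we observed (empty) updates through
# 2020-01-10:
arc <- as_epi_archive(
  update_tbl,
  clobberable_versions_start = max(update_tbl$version),
  versions_end = as.Date("2020-01-10")
)
```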
#' @param clobberable_versions_start Optional; `length`-1; either a value of the -#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and -#' `typeof`: specifically, either (a) the earliest version that could be -#' subject to "clobbering" (being overwritten with different update data, but -#' using the *same* version tag as the old update data), or (b) `NA`, to -#' indicate that no versions are clobberable. There are a variety of reasons -#' why versions could be clobberable under routine circumstances, such as (a) -#' today's version of one/all of the columns being published after initially -#' being filled with `NA` or LOCF, (b) a buggy version of today's data being -#' published but then fixed and republished later in the day, or (c) data -#' pipeline delays (e.g., publisher uploading, periodic scraping, database -#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected -#' later in the day (or even on a different day) than expected; potential -#' causes vary between different data pipelines. The default value is `NA`, -#' which doesn't consider any versions to be clobberable. Another setting that -#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. -#' @param versions_end Optional; length-1, same `class` and `typeof` as -#' `x$version`: what is the last version we have observed? The default is +#' same `class` as `x$version`, or an `NA` of any `class`: specifically, +#' either (a) the earliest version that could be subject to "clobbering" +#' (being overwritten with different update data, but using the *same* version +#' tag as the old update data), or (b) `NA`, to indicate that no versions are +#' clobberable. There are a variety of reasons why versions could be +#' clobberable under routine circumstances, such as (a) today's version of +#' one/all of the columns being published after initially being filled with +#' `NA` or LOCF, (b) a buggy version of today's data being published but then +#' fixed and republished later in the day, or (c) data pipeline delays (e.g., +#' publisher uploading, periodic scraping, database syncing, periodic +#' fetching, etc.) that make events (a) or (b) reflected later in the day (or +#' even on a different day) than expected; potential causes vary between +#' different data pipelines. The default value is `NA`, which doesn't consider +#' any versions to be clobberable. Another setting that may be appropriate for +#' some pipelines is `max_version_with_row_in(x)`. +#' @param versions_end Optional; length-1, same `class` as `x$version`: what is +#' the last version we have observed? The default is #' `max_version_with_row_in(x)`, but values greater than this could also be #' valid, and would indicate that we observed additional versions of the data #' beyond `max(x$version)`, but they all contained empty updates. (The default #' value of `clobberable_versions_start` does not fully trust these empty #' updates, and assumes that any version `>= max(x$version)` could be #' clobbered.) If `nrow(x) == 0`, then this argument is mandatory. -#' @return An `epi_archive` object. +#' @return * Of `new_epi_archive`: an (unvalidated) `epi_archive` #' #' @seealso [`epix_as_of`] [`epix_merge`] [`epix_slide`] #' @importFrom dplyr if_any if_all everything #' @importFrom utils capture.output #' -#' @name epi_archive -#' @export -#' #' @examples #' # Simple ex. 
with necessary keys #' tib <- tibble::tibble( @@ -277,6 +267,9 @@ next_after.Date <- function(x) x + 1L #' #' x <- df %>% as_epi_archive(other_keys = "county") #' +#' @name epi_archive +#' @order 3 +#' @export new_epi_archive <- function( x, geo_type, @@ -329,7 +322,14 @@ new_epi_archive <- function( #' Perform second (costly) round of validation that `x` is a proper `epi_archive` #' +#' Does not validate `geo_type` or `time_type` against `geo_value` and +#' `time_value` columns. These are assumed to have been set compatibly. +#' +#' @return * Of `validate_epi_archive`: an `epi_archive`, +#' [invisibly][base::invisible] (or raises an error if `x` was invalid) +#' #' @rdname epi_archive +#' @order 4 #' @export validate_epi_archive <- function(x) { assert_class(x, "epi_archive") @@ -515,8 +515,10 @@ is_locf <- function(vec, abs_tol, is_key) { # nolint: object_usage_linter #' @param .versions_end location based versions_end, used to avoid prefix #' `version = issue` from being assigned to `versions_end` instead of being #' used to rename columns. +#' @return * Of `as_epi_archive`: an `epi_archive` object #' #' @rdname epi_archive +#' @order 1 #' #' @export as_epi_archive <- function( @@ -808,3 +810,16 @@ clone.epi_archive <- function(x) { x$DT <- data.table::copy(x$DT) x } + +#' Test for `epi_archive` format +#' +#' @param x An object. +#' @return * Of `is_epi_archive`: `TRUE` if the object inherits from `epi_archive`, +#' otherwise `FALSE`. +#' +#' @rdname epi_archive +#' @order 2 +#' @export +is_epi_archive <- function(x) { + inherits(x, "epi_archive") +} diff --git a/R/autoplot.R b/R/autoplot.R index ecfe5f1c9..2644f3ba4 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -1,7 +1,7 @@ -#' Automatically plot an epi_df +#' Automatically plot an epi_df or epi_archive #' -#' @param object An `epi_df` -#' @param ... <[`tidy-select`][dplyr_tidy_select]> One or more unquoted +#' @param object,x An `epi_df` or `epi_archive` +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted #' expressions separated by commas. Variable names can be used as if they #' were positions in the data frame, so expressions like `x:y` can #' be used to select a range of variables. @@ -16,16 +16,27 @@ #' * `none` - no coloring aesthetic is applied #' @param .facet_by Similar to `.color_by` except that the default is to display #' each numeric variable on a separate facet -#' @param .base_color Lines will be shown with this color. For example, with a -#' single numeric variable and faceting by `geo_value`, all locations would -#' share the same color line. -#' @param .max_facets Cut down of the number of facets displayed. Especially -#' useful for testing when there are many `geo_value`'s or keys. +#' @param .base_color Lines will be shown with this color if `.color_by == "none"`. +#' For example, with a single numeric variable and faceting by `geo_value`, all +#' locations would share the same color line. +#' @param .max_facets `r lifecycle::badge("deprecated")` +#' @param .facet_filter Select which facets will be displayed. Especially +#' useful when there are many `geo_value`s or keys. This is a +#' <[`rlang`][rlang::args_data_masking]> expression along the lines of [dplyr::filter()]. +#' However, it must be a single expression combined with the `&` operator. This +#' contrasts with the typical use case, which allows multiple comma-separated expressions +#' that are implicitly combined with `&`.
When multiple variables are selected +#' with `...`, their names can be filtered in combination with other factors +#' by using `.response_name`. See the examples below. #' -#' @return A ggplot object +#' +#' +#' @return A [ggplot2::ggplot] object #' @export +#' @name autoplot-epi #' #' @examples +#' # -- Use it on an `epi_df` #' autoplot(cases_deaths_subset, cases, death_rate_7d_av) #' autoplot(cases_deaths_subset, case_rate_7d_av, .facet_by = "geo_value") #' autoplot(cases_deaths_subset, case_rate_7d_av, @@ -41,16 +52,39 @@ #' autoplot(cases_deaths_subset, case_rate_7d_av, #' .base_color = "red", .facet_by = "geo_value" #' ) +#' +#' # filter to only some facets, must be explicitly combined +#' autoplot(cases_deaths_subset, cases, death_rate_7d_av, +#' .facet_by = "all", +#' .facet_filter = (.response_name == "cases" & geo_value %in% c("tx", "pa")) | +#' (.response_name == "death_rate_7d_av" & +#' geo_value %in% c("ca", "fl", "ga", "ny")) +#' ) +#' # Just an alias for convenience +#' plot(cases_deaths_subset, cases, death_rate_7d_av, +#' .facet_by = "all", +#' .facet_filter = (.response_name == "cases" & geo_value %in% c("tx", "pa")) | +#' (.response_name == "death_rate_7d_av" & +#' geo_value %in% c("ca", "fl", "ga", "ny")) +#' ) autoplot.epi_df <- function( object, ..., .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), .base_color = "#3A448F", - .max_facets = Inf) { + .facet_filter = NULL, + .max_facets = deprecated()) { .color_by <- rlang::arg_match(.color_by) .facet_by <- rlang::arg_match(.facet_by) + .facet_filter <- rlang::enquo(.facet_filter) - assert(anyInfinite(.max_facets), checkInt(.max_facets), combine = "or") + if (lifecycle::is_present(.max_facets)) { + lifecycle::deprecate_warn( + "0.11.1", + "autoplot.epi_df(.max_facets = )", + "autoplot.epi_df(.facet_filter = )" + ) + } assert_character(.base_color, len = 1) key_cols <- key_colnames(object) @@ -58,44 +92,14 @@ autoplot.epi_df <- function( geo_and_other_keys <- key_colnames(object, exclude = "time_value") # --- check for numeric variables - allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) - allowed <- allowed[allowed] - if (length(allowed) == 0 && rlang::dots_n(...) == 0L) { - cli::cli_abort("No numeric variables were available to plot automatically.", - class = "epiprocess__no_numeric_vars_available" - ) - } - vars <- tidyselect::eval_select(rlang::expr(c(...)), object) - if (rlang::is_empty(vars)) { # find them automatically if unspecified - vars <- tidyselect::eval_select(names(allowed)[1], object) - cli::cli_warn( - "Plot variable was unspecified. Automatically selecting {.var {names(allowed)[1]}}.", - class = "epiprocess__unspecified_plot_var" - ) - } else { # if variables were specified, ensure that they are numeric - ok <- names(vars) %in% names(allowed) - if (!any(ok)) { - cli::cli_abort( - "None of the requested variables {.var {names(vars)}} are numeric.", - class = "epiprocess__all_requested_vars_not_numeric" - ) - } else if (!all(ok)) { - cli::cli_warn( - c( - "Only the requested variables {.var {names(vars)[ok]}} are numeric.", - i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." 
- ), - class = "epiprocess__some_requested_vars_not_numeric" - ) - vars <- vars[ok] - } - } + vars <- autoplot_check_viable_response_vars(object, ..., non_key_cols = non_key_cols) + nvars <- length(vars) # --- create a viable df to plot pos <- tidyselect::eval_select( rlang::expr(c("time_value", tidyselect::all_of(geo_and_other_keys), names(vars))), object ) - if (length(vars) > 1) { + if (nvars > 1) { object <- tidyr::pivot_longer( object[pos], tidyselect::all_of(names(vars)), values_to = ".response", @@ -106,35 +110,34 @@ autoplot.epi_df <- function( } all_keys <- rlang::syms(as.list(geo_and_other_keys)) other_keys <- rlang::syms(as.list(setdiff(geo_and_other_keys, "geo_value"))) - all_avail <- rlang::syms(as.list(c(geo_and_other_keys, ".response_name"))) + all_avail <- rlang::syms(as.list(c( + geo_and_other_keys, + if (nvars > 1) ".response_name" else NULL + ))) object <- object %>% dplyr::mutate( .colours = switch(.color_by, - all_keys = interaction(!!!all_keys, sep = "/"), + all_keys = interaction(!!!all_keys, sep = " / "), geo_value = .data$geo_value, - other_keys = interaction(!!!other_keys, sep = "/"), - all = interaction(!!!all_avail, sep = "/"), + other_keys = interaction(!!!other_keys, sep = " / "), + all = interaction(!!!all_avail, sep = " / "), NULL ), .facets = switch(.facet_by, - all_keys = interaction(!!!all_keys, sep = "/"), + all_keys = interaction(!!!all_keys, sep = " / "), geo_value = as.factor(.data$geo_value), - other_keys = interaction(!!!other_keys, sep = "/"), - all = interaction(!!!all_avail, sep = "/"), + other_keys = interaction(!!!other_keys, sep = " / "), + all = interaction(!!!all_avail, sep = " / "), NULL ) ) - if (.max_facets < Inf && ".facets" %in% names(object)) { - n_facets <- nlevels(object$.facets) - if (n_facets > .max_facets) { - top_n <- levels(as.factor(object$.facets))[seq_len(.max_facets)] - object <- dplyr::filter(object, .data$.facets %in% top_n) %>% - dplyr::mutate(.facets = droplevels(.data$.facets)) - if (".colours" %in% names(object)) { - object <- dplyr::mutate(object, .colours = droplevels(.data$.colours)) - } + if (!rlang::quo_is_null(.facet_filter) && ".facets" %in% names(object)) { + object <- dplyr::filter(object, !!.facet_filter) %>% + dplyr::mutate(.facets = droplevels(.data$.facets)) + if (".colours" %in% names(object)) { + object <- dplyr::mutate(object, .colours = droplevels(.data$.colours)) } } @@ -170,3 +173,197 @@ autoplot.epi_df <- function( } p } + +autoplot_check_viable_response_vars <- function( + object, ..., non_key_cols, call = caller_env()) { + allowed <- purrr::map_lgl(object[non_key_cols], is.numeric) + allowed <- allowed[allowed] + if (length(allowed) == 0 && rlang::dots_n(...) == 0L) { + cli::cli_abort("No numeric variables were available to plot automatically.", + class = "epiprocess__no_numeric_vars_available", + call = call + ) + } + vars <- tidyselect::eval_select(rlang::expr(c(...)), object) + if (rlang::is_empty(vars)) { # find them automatically if unspecified + vars <- tidyselect::eval_select(names(allowed)[1], object) + cli::cli_warn( + "Plot variable was unspecified. 
Automatically selecting {.var {names(allowed)[1]}}.", + class = "epiprocess__unspecified_plot_var", + call = call + ) + } else { # if variables were specified, ensure that they are numeric + ok <- names(vars) %in% names(allowed) + if (!any(ok)) { + cli::cli_abort( + "None of the requested variables {.var {names(vars)}} are numeric.", + class = "epiprocess__all_requested_vars_not_numeric", + call = call + ) + } else if (!all(ok)) { + cli::cli_warn( + c( + "Only the requested variables {.var {names(vars)[ok]}} are numeric.", + i = "`autoplot()` cannot display {.var {names(vars)[!ok]}}." + ), + class = "epiprocess__some_requested_vars_not_numeric", + call = call + ) + vars <- vars[ok] + } + } + vars +} + + + +#' @param .versions Select which versions will be displayed. By default, +#' a separate line will be shown with the data as it would have appeared on +#' every day in the archive. This can sometimes become overwhelming. For +#' example, daily data would display a line for what the data would have looked +#' like on every single day. To override this, you can select specific dates, +#' by passing a vector of values here. Alternatively, a sequence can be +#' automatically created by passing a string like `"2 weeks"` or `"month"`. +#' For time types where the `time_value` is a date object, any string that +#' is interpretable by [base::seq.Date()] is allowed. +#' +#' For `time_type = "integer"`, an integer larger than 1 will give a subset +#' of versions. +#' @param .mark_versions Logical. Indicate whether to mark each version with +#' a vertical line. Note that displaying many versions can become busy. +#' +#' @export +#' @rdname autoplot-epi +#' +#' @examples +#' +#' # -- Use it on an archive +#' +#' autoplot(archive_cases_dv_subset, percent_cli, .versions = "week") +#' autoplot(archive_cases_dv_subset_all_states, percent_cli, +#' .versions = "week", +#' .facet_filter = geo_value %in% c("or", "az", "vt", "ms") +#' ) +#' autoplot(archive_cases_dv_subset, percent_cli, +#' .versions = "month", +#' .facet_filter = geo_value == "ca" +#' ) +#' autoplot(archive_cases_dv_subset_all_states, percent_cli, +#' .versions = "1 month", +#' .facet_filter = geo_value %in% c("or", "az", "vt", "ms"), +#' .mark_versions = TRUE +#' ) +#' # Just an alias for convenience +#' plot(archive_cases_dv_subset_all_states, percent_cli, +#' .versions = "1 month", +#' .facet_filter = geo_value %in% c("or", "az", "vt", "ms"), +#' .mark_versions = TRUE +#' ) +autoplot.epi_archive <- function(object, ..., + .base_color = "black", + .versions = NULL, + .mark_versions = FALSE, + .facet_filter = NULL) { + time_type <- object$time_type + checkmate::assert_logical(.mark_versions, len = 1L) + if (time_type == "custom") { + cli_abort( + "This `epi_archive` has custom `time_type`. 
This is currently unsupported.", + class = "epiprocess__autoplot_archive_custom_time_type" + ) + } + + max_version <- max(object$DT$version) + min_version <- min(object$DT$version) + + tt_lookup <- c("day" = "day", "week" = "week", "yearmonth" = "month") + .versions <- .versions %||% ifelse(time_type == "integer", 1L, unname(tt_lookup[time_type])) + if (is.character(.versions) || length(.versions) == 1L) { + if (is.numeric(.versions)) .versions <- round(abs(.versions)) + .versions <- seq(min_version, max_version - 1, by = .versions) + } else if (methods::is(.versions, "Date") || is.numeric(.versions)) { + .versions <- .versions[min_version <= .versions & .versions <= max_version] + } else { + cli_abort( + "Requested `.versions` don't appear to match the available `time_type`.", + class = "epiprocess__autoplot_archive_bad_versions" + ) + } + + + finalized <- epix_as_of(object, max_version) + key_cols <- key_colnames(finalized) + non_key_cols <- setdiff(names(finalized), key_cols) + vars <- autoplot_check_viable_response_vars(finalized, ..., non_key_cols = non_key_cols) + nvars <- length(vars) + + bp <- autoplot.epi_df( + finalized, ..., + .base_color = .base_color, .facet_by = "all", + .facet_filter = {{ .facet_filter }}, .color_by = "none" + ) + ggplot2::xlab("Date") + geo_and_other_keys <- key_colnames(object, exclude = c("time_value", "version")) + all_avail <- rlang::syms(as.list(c( + geo_and_other_keys, + if (nvars > 1) ".response_name" else NULL + ))) + + snapshots <- purrr::map( + .versions, + function(v) { + dplyr::mutate(epix_as_of(object, v), version = v) + } + ) %>% + purrr::list_rbind() %>% + dplyr::mutate(.facets = interaction(!!!all_avail, sep = " / ")) + + if (nvars > 1) { + snapshots <- tidyr::pivot_longer( + snapshots, tidyselect::all_of(names(vars)), + values_to = ".response", + names_to = ".response_name" + ) + } else { + snapshots <- dplyr::rename(snapshots, .response := !!names(vars)) # nolint: object_usage_linter + } + snapshots <- snapshots %>% + dplyr::filter(!is.na(.response), .data$.facets %in% unique(bp$data$.facets)) + + bp <- bp + + ggplot2::geom_line( + data = snapshots, + mapping = ggplot2::aes(y = .response, color = version, group = factor(version)) + ) + + if (methods::is(.versions, "Date")) { + bp <- bp + ggplot2::scale_color_viridis_c(name = "Version", trans = "date") + } else { + bp <- bp + ggplot2::scale_color_viridis_c(name = "Version") + } + + if (.mark_versions) { + bp <- bp + + ggplot2::geom_vline( + data = snapshots, + ggplot2::aes(color = version, xintercept = version), + linewidth = .5, + linetype = 3, + show.legend = FALSE + ) + } + # make the finalized layer last + bp$layers <- rev(bp$layers) + bp +} + +#' @export +#' @rdname autoplot-epi +plot.epi_df <- function(x, ...) { + autoplot(x, ...) +} + +#' @export +#' @rdname autoplot-epi +plot.epi_archive <- function(x, ...) { + autoplot(x, ...) +} diff --git a/R/epi_df.R b/R/epi_df.R index 4955ab08d..b9d999d97 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -6,7 +6,8 @@ #' which can be seen as measured variables at each key. In brief, an `epi_df` #' represents a snapshot of an epidemiological data set at a point in time. #' -#' @details An `epi_df` is a tibble with (at least) the following columns: +#' @details An `epi_df` is a kind of tibble with (at least) the following +#' columns: #' #' - `geo_value`: A character vector representing the geographical unit of #' observation. 
This could be a country code, a state name, a county code, @@ -14,10 +15,11 @@ #' - `time_value`: A date or integer vector representing the time of observation. #' #' Other columns can be considered as measured variables, which we also refer to -#' as signal variables. An `epi_df` object also has metadata with (at least) -#' the following fields: +#' as indicators or signals. An `epi_df` object also has metadata with (at +#' least) the following fields: #' #' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. #' * `as_of`: the time value at which the given data were available. #' #' Most users should use `as_epi_df`. The input tibble `x` to the constructor diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 378ea13bc..5a64e3ff1 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -203,13 +203,12 @@ ungroup.grouped_epi_archive <- function(x, ...) { } -#' @rdname epix_slide -#' #' @importFrom data.table key address rbindlist setDF copy #' @importFrom tibble as_tibble new_tibble validate_tibble #' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg +#' #' @export epix_slide.grouped_epi_archive <- function( .x, @@ -437,9 +436,13 @@ epix_slide.grouped_epi_archive <- function( out <- lapply(.versions, function(.version) { # Ungrouped as-of data; `epi_df` if `all_versions` is `FALSE`, # `epi_archive` if `all_versions` is `TRUE`: + min_time_value <- .version - .before + if (is.na(min_time_value)) { + min_time_value <- -Inf + } as_of_raw <- .x$private$ungrouped %>% epix_as_of( .version, - min_time_value = .version - .before, + min_time_value = min_time_value, all_versions = .all_versions ) diff --git a/R/inline-roxygen.R b/R/inline-roxygen.R new file mode 100644 index 000000000..e6fb4cd11 --- /dev/null +++ b/R/inline-roxygen.R @@ -0,0 +1,16 @@ +# Helpers here are meant to be used inside inline R expressions within roxygen2 +# documentation when @template is inappropriate. + +#' Description of a single arg that tidyselects value variables +#' +#' Not meant for when describing tidyselect `...`. +#' +#' @keywords internal +tidyselect_arg_roxygen <- ' + <[`tidy-select`][dplyr::dplyr_tidy_select]> An unquoted column + name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), + [other tidy-select expression][tidyselect::language], or a vector of + characters (e.g. `c("cases", "deaths")`). Variable names can be used as if + they were positions in the data frame, so expressions like `x:y` can be + used to select a range of variables. 
+' diff --git a/R/key_colnames.R b/R/key_colnames.R index eeecce05b..f685b7210 100644 --- a/R/key_colnames.R +++ b/R/key_colnames.R @@ -91,8 +91,8 @@ key_colnames.epi_df <- function(x, ..., if (!identical(other_keys, expected_other_keys)) { cli_abort(c( "The provided `other_keys` argument didn't match the `other_keys` of `x`", - "*" = "`other_keys` was {format_chr_with_quotes(other_keys)}", - "*" = "`expected_other_keys` was {format_chr_with_quotes(expected_other_keys)}", + "*" = "`other_keys` was {format_chr_deparse(other_keys)}", + "*" = "`expected_other_keys` was {format_chr_deparse(expected_other_keys)}", "i" = "If you know that `x` will always be an `epi_df` and resolve this discrepancy by adjusting the metadata of `x`, you shouldn't have to pass `other_keys =` here anymore, diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 362fd4eaa..6c372d9bf 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -58,6 +58,7 @@ #' epix_as_of(archive_cases_dv_subset2, max(archive_cases_dv_subset$DT$version)) #' #' @importFrom data.table between key +#' @importFrom checkmate assert_scalar assert_logical assert_class #' @export epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, max_version = deprecated()) { @@ -79,15 +80,17 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, "`version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!identical(typeof(version), typeof(x$DT$version))) { - cli_abort( - "`version` must have the same `typeof` as `epi_archive$DT$version`." - ) - } assert_scalar(version, na.ok = FALSE) if (version > x$versions_end) { cli_abort("`version` must be at most `epi_archive$versions_end`.") } + assert_scalar(min_time_value, na.ok = FALSE) + min_time_value_inf <- is.infinite(min_time_value) && min_time_value < 0 + min_time_value_same_type <- identical(class(min_time_value), class(x$DT$time_value)) + if (!min_time_value_inf && !min_time_value_same_type) { + cli_abort("`min_time_value` must be either -Inf or a time_value of the same type and + class as `epi_archive$time_value`.") + } assert_logical(all_versions, len = 1) if (!is.na(x$clobberable_versions_start) && version >= x$clobberable_versions_start) { cli_warn( @@ -100,39 +103,63 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, ) } - # We can't disable nonstandard evaluation nor use the `..` feature in the `i` - # argument of `[.data.table` below; try to avoid problematic names and abort - # if we fail to do so: - .min_time_value <- min_time_value - .version <- version - if (any(c(".min_time_value", ".version") %in% names(x$DT))) { - cli_abort("epi_archives can't contain a `.min_time_value` or `.version` column") - } - # Filter by version and return if (all_versions) { # epi_archive is copied into result, so we can modify result directly result <- epix_truncate_versions_after(x, version) - result$DT <- result$DT[time_value >= .min_time_value, ] # nolint: object_usage_linter + if (!min_time_value_inf) { + # See below for why we need this branch. 
+ filter_mask <- result$DT$time_value >= min_time_value + result$DT <- result$DT[filter_mask, ] # nolint: object_usage_linter + } return(result) } # Make sure to use data.table ways of filtering and selecting - as_of_epi_df <- x$DT[time_value >= .min_time_value & version <= .version, ] %>% # nolint: object_usage_linter - unique( - by = c("geo_value", "time_value", other_keys), - fromLast = TRUE - ) %>% + if (min_time_value_inf) { + # This branch is needed for `epix_as_of` to work with `yearmonth` time type + # to avoid time_value > .min_time_value, which is NA for `yearmonth`. + filter_mask <- x$DT$version <= version + } else { + filter_mask <- x$DT$time_value >= min_time_value & x$DT$version <= version + } + as_of_epi_df <- x$DT[filter_mask, ] %>% + unique(by = c("geo_value", "time_value", other_keys), fromLast = TRUE) %>% + as.data.frame() %>% tibble::as_tibble() %>% dplyr::select(-"version") %>% - as_epi_df( - as_of = version, - other_keys = other_keys - ) + as_epi_df(as_of = version, other_keys = other_keys) return(as_of_epi_df) } +#' Get the latest snapshot from an `epi_archive` object. +#' +#' The latest snapshot is the snapshot of the last known version. +#' +#' @param x An `epi_archive` object +#' @return The latest snapshot from an `epi_archive` object +#' @export +epix_as_of_current <- function(x) { + assert_class(x, "epi_archive") + x %>% epix_as_of(.$versions_end) +} + +#' Set the `versions_end` attribute of an `epi_archive` object +#' +#' An escape hatch for epix_as_of, which does not allow version > +#' `$versions_end`. +#' +#' @param x An `epi_archive` object +#' @param versions_end The new `versions_end` value +#' @return An `epi_archive` object with the updated `versions_end` attribute +#' @export +set_versions_end <- function(x, versions_end) { + assert_class(x, "epi_archive") + validate_version_bound(versions_end, x$DT, na_ok = FALSE) + x$versions_end <- versions_end + x +} #' Fill `epi_archive` unobserved history #' @@ -450,7 +477,8 @@ epix_merge <- function(x, y, y_nonby_colnames <- setdiff(names(y_dt), by) if (length(intersect(x_nonby_colnames, y_nonby_colnames)) != 0L) { cli_abort(" - `x` and `y` DTs have overlapping non-by column names; + `x` and `y` DTs both have measurement columns named + {format_chr_with_quotes(intersect(x_nonby_colnames, y_nonby_colnames))}; this is currently not supported; please manually fix up first: any overlapping columns that can are key-like should be incorporated into the key, and other columns should be renamed. @@ -622,72 +650,89 @@ epix_detailed_restricted_mutate <- function(.data, ...) { } -#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` +#' Take each requested (group and) version in an archive, run a computation (e.g., forecast) #' -#' Slides a given function over variables in an `epi_archive` object. This -#' behaves similarly to `epi_slide()`, with the key exception that it is -#' version-aware: the sliding computation at any given reference time t is -#' performed on **data that would have been available as of t**. This function -#' is intended for use in accurate backtesting of models; see -#' `vignette("backtesting", package="epipredict")` for a walkthrough. +#' ... and collect the results. 
This is useful for more accurately simulating +#' how a forecaster, nowcaster, or other algorithm would have behaved in real +#' time, factoring in reporting latency and data revisions; see +#' \href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{`vignette("backtesting", +#' package="epipredict")`} for a walkthrough. +#' +#' This is similar to looping over versions and calling [`epix_as_of`], but has +#' some conveniences such as working naturally with [`grouped_epi_archive`]s, +#' optional time windowing, and syntactic sugar to make things shorter to write. #' #' @param .x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, #' all data in `x` will be treated as part of a single data group. #' @param .f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation over a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `.before` parameter (see details for more). If a -#' function, `.f` must have the form `function(x, g, t, ...)`, where -#' -#' - "x" is an epi_df with the same column names as the archive's `DT`, minus -#' the `version` column -#' - "g" is a one-row tibble containing the values of the grouping variables -#' for the associated group -#' - "t" is the ref_time_value for the current window -#' - "..." are additional arguments +#' computation. The computation will be run on each requested group-version +#' combination, with a time window filter applied if `.before` is supplied. +#' +#' If `.f` is a function, it must have the form `function(x, g, v)` or +#' `function(x, g, v, <additional configuration args>)`, where +#' +#' - `x` is an `epi_df` with the same column names as the archive's `DT`, +#' minus the `version` column. (Or, if `.all_versions = TRUE`, an +#' `epi_archive` with the requested partial version history.) +#' +#' - `g` is a one-row tibble containing the values of the grouping variables +#' for the associated group. +#' +#' - `v` (length-1) is the associated `version` (one of the requested +#' `.versions`) +#' +#' - `<additional configuration args>` are optional; you can add such +#' arguments to your function and set them by passing them through the +#' `...` argument to `epix_slide()`. #' #' If a formula, `.f` can operate directly on columns accessed via `.x$var` or #' `.$var`, as in `~ mean(.x$var)` to compute a mean of a column `var` for #' each group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via `.z` -#' or `.ref_time_value`. If `.f` is missing, then `...` will specify the -#' computation. +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z`, `.version`, or `.ref_time_value`. If `.f` is missing, then `...` will +#' specify the computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted #' as a ["data-masking"][rlang::args_data_masking] expression or expressions #' for tidy evaluation; in addition to referring to columns directly by name, the #' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs, #' and can also refer to `.x` (not the same as the input epi_archive), -#' `.group_key`, and `.ref_time_value`. See details for more. -#' @param .before How many time values before the `.ref_time_value` -#' should each snapshot handed to the function `.f` contain?
If provided, it -#' should be a single value that is compatible with the time_type of the -#' time_value column (more below), but most commonly an integer. This window -#' endpoint is inclusive. For example, if `.before = 7`, `time_type` -#' in the archive is "day", and the `.ref_time_value` is January 8, then the -#' smallest time_value in the snapshot will be January 1. If missing, then the -#' default is no limit on the time values, so the full snapshot is given. -#' @param .versions Reference time values / versions for sliding -#' computations; each element of this vector serves both as the anchor point -#' for the `time_value` window for the computation and the `max_version` -#' `epix_as_of` which we fetch data in this window. If missing, then this will -#' set to a regularly-spaced sequence of values set to cover the range of -#' `version`s in the `DT` plus the `versions_end`; the spacing of values will -#' be guessed (using the GCD of the skips between values). +#' `.group_key` and `.version`/`.ref_time_value`. See details for more. +#' @param .before Optional; applies a `time_value` filter before running each +#' computation. The default is not to apply a `time_value` filter. If +#' provided, it should be a single integer or difftime that is compatible with +#' the time_type of the time_value column. If an integer, then the minimum +#' possible `time_value` included will be that many time steps (according to +#' the `time_type`) before each requested `.version`. This window endpoint is +#' inclusive. For example, if `.before = 14`, the `time_type` in the archive +#' is "day", and the requested `.version` is January 15, then the smallest +#' `time_value` possible in the snapshot will be January 1. Note that +#' this does not mean that there will be 14 or 15 distinct `time_value`s +#' actually appearing in the data; for most reporting streams, reporting as of +#' January 15 won't include `time_value`s all the way through January 14, due +#' to reporting latency. Unlike `epi_slide()`, `epix_slide()` won't fill in +#' any missing `time_value`s in this window. +#' @param .versions Requested versions on which to run the computation. Each +#' requested `.version` also serves as the anchor point from which +#' the `time_value` window specified by `.before` is drawn. If `.versions` is +#' missing, it will be set to a regularly-spaced sequence of values covering +#' the range of `version`s in the `DT` plus the `versions_end`; the +#' spacing of values will be guessed (using the GCD of the skips between +#' values). #' @param .new_col_name Either `NULL` or a string indicating the name of the new #' column that will contain the derived values. The default, `NULL`, will use #' the name "slide_value" unless your slide computations output data frames, -#' in which case they will be unpacked into the constituent columns and those -#' names used. If the resulting column name(s) overlap with the column names -#' used for labeling the computations, which are `group_vars(x)` and -#' `"version"`, then the values for these columns must be identical to the -#' labels we assign. +#' in which case they will be unpacked into the constituent columns and the +#' data frame's column names will be used instead. If the resulting column +#' name(s) overlap with the column names used for labeling the computations, +#' which are `group_vars(x)` and `"version"`, then the values for these +#' columns must be identical to the labels we assign.
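A small sketch of the `.before` windowing described above, again assuming the `archive_cases_dv_subset` example archive; the counts returned depend on that archive's actual reporting latency:

```r
library(epiprocess)
library(dplyr)

# With `.before = 14` and requested version 2021-01-15, each computation sees
# only time_values on or after 2021-01-01 (the window endpoint is inclusive),
# and typically fewer than 15 of them because of reporting latency.
archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(
    n_time_values_seen = dplyr::n_distinct(time_value),
    .before = 14,
    .versions = as.Date(c("2021-01-15", "2021-02-15"))
  )
```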
#' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.) #' If `.all_versions = TRUE`, then the slide computation will be passed the -#' version history (all `version <= .version` where `.version` is one of the -#' requested `.versions`) for rows having a `time_value` of at least `.version -#' - before`. Otherwise, the slide computation will be passed only the most -#' recent `version` for every unique `time_value`. Default is `FALSE`. +#' version history (all versions `<= .version` where `.version` is one of the +#' requested `.version`s), in `epi_archive` format. Otherwise, the slide +#' computation will be passed only the most recent `version` for every unique +#' `time_value`, in `epi_df` format. Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables (if any), #' `time_value`, containing the reference time values for the slide #' computation, and a column named according to the `.new_col_name` argument, @@ -696,16 +741,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' @details A few key distinctions between the current function and `epi_slide()`: #' 1. In `.f` functions for `epix_slide`, one should not assume that the input #' data contains any rows with `time_value` matching the computation's -#' `.ref_time_value` (accessible via `attributes()$metadata$as_of`); for -#' typical epidemiological surveillance data, observations pertaining to a -#' particular time period (`time_value`) are first reported `as_of` some -#' instant after that time period has ended. +#' `.version`, due to reporting latency; for typical epidemiological +#' surveillance data, observations pertaining to a particular time period +#' (`time_value`) are first reported `as_of` some instant after that time +#' period has ended. No time window completion is performed as in +#' `epi_slide()`. #' 2. The input class and columns are similar but different: `epix_slide` #' (with the default `.all_versions=FALSE`) keeps all columns and the #' `epi_df`-ness of the first argument to each computation; `epi_slide` only #' provides the grouping variables in the second input, and will convert the #' first input into a regular tibble if the grouping variables include the -#' essential `geo_value` column. (With .all_versions=TRUE`, `epix_slide` will +#' essential `geo_value` column. (With `.all_versions=TRUE`, `epix_slide` #' will provide an `epi_archive` rather than an `epi_df` to each #' computation.) #' 3. The output class and columns are similar but different: `epix_slide()` @@ -719,81 +765,60 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' 4. There are no size stability checks or element/row recycling to maintain #' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is #' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly -#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed -#' in the "advanced" vignette. +#' analogous to [`dplyr::mutate`].) #' 5. `.all_rows` is not supported in `epix_slide`; since the slide #' computations are allowed more flexibility in their outputs than in #' `epi_slide`, we can't guess a good representation for missing computations #' for excluded group-`.ref_time_value` pairs. -#' 76. The `.versions` default for `epix_slide` is based on making an +#' 6. The `.versions` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the -#' `versions_end`, rather than the `time_value`s.
+#' `versions_end`, rather than all unique `time_value`s. +#' 7. `epix_slide()` computations can refer to the current element of +#' `.versions` as either `.version` or `.ref_time_value`, while `epi_slide()` +#' computations refer to the current element of `.ref_time_values` with +#' `.ref_time_value`. #' #' Apart from the above distinctions, the interfaces between `epix_slide()` and #' `epi_slide()` are the same. #' -#' Furthermore, the current function can be considerably slower than -#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch -#' properly-versioned snapshots from the data archive (via `epix_as_of()`), -#' and (2) it performs a "manual" sliding of sorts, and does not benefit from -#' the highly efficient `slider` package. For this reason, it should never be -#' used in place of `epi_slide()`, and only used when version-aware sliding is -#' necessary (as it its purpose). -#' #' @examples #' library(dplyr) #' -#' # Reference time points for which we want to compute slide values: -#' versions <- seq(as.Date("2020-06-02"), -#' as.Date("2020-06-15"), -#' by = "1 day" -#' ) +#' # Request only a small set of versions, for example's sake: +#' requested_versions <- +#' seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day") #' -#' # A simple (but not very useful) example (see the archive vignette for a more -#' # realistic one): +#' # Investigate reporting lag of `percent_cli` signal (though normally we'd +#' # probably work off of the dedicated `revision_summary()` function instead): #' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% #' epix_slide( -#' .f = ~ mean(.x$case_rate_7d_av), -#' .before = 2, -#' .versions = versions, -#' .new_col_name = "case_rate_7d_av_recent_av" -#' ) %>% -#' ungroup() -#' # We requested time windows that started 2 days before the corresponding time -#' # values. The actual number of `time_value`s in each computation depends on -#' # the reporting latency of the signal and `time_value` range covered by the -#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -#' # discarded -#' # * 1 `time_value`, for ref time 2020-06-02 -#' # * 2 `time_value`s, for the rest of the results -#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because -#' # of data latency, we'll never have an observation -#' # `time_value == .ref_time_value` as of `.ref_time_value`. -#' # The example below shows this type of behavior in more detail. -#' -#' # Examining characteristics of the data passed to each computation with -#' # `all_versions=FALSE`. 
+#' geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]), +#' geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time, +#' .versions = requested_versions +#' ) #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% #' epix_slide( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if (nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' .before = 5, .all_versions = FALSE, -#' .versions = versions -#' ) %>% -#' ungroup() %>% -#' arrange(geo_value, version) +#' percent_cli_max_time = max(time_value[!is.na(percent_cli)]), +#' percent_cli_rpt_lag = .version - percent_cli_max_time, +#' .versions = requested_versions +#' ) +#' +#' # Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect +#' # to the data version history): +#' case_death_rate_archive %>% +#' epix_slide( +#' .versions = as.Date(c("2021-10-01", "2021-10-08")), +#' function(x, g, v) { +#' epipredict::arx_forecaster( +#' x, +#' outcome = "death_rate", +#' predictors = c("death_rate_7d_av", "case_rate_7d_av") +#' )$predictions +#' } +#' ) +#' # See `vignette("backtesting", package="epipredict")` for a full walkthrough +#' # on backtesting forecasters, including plots, etc. #' #' # --- Advanced: --- #' @@ -825,7 +850,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' ) #' }, #' .before = 5, .all_versions = TRUE, -#' .versions = versions +#' .versions = requested_versions #' ) %>% #' ungroup() %>% #' # Focus on one geo_value so we can better see the columns above: @@ -845,7 +870,6 @@ epix_slide <- function( } -#' @rdname epix_slide #' @export epix_slide.epi_archive <- function( .x, @@ -879,10 +903,13 @@ epix_slide.epi_archive <- function( #' @noRd epix_slide_versions_default <- function(ea) { versions_with_updates <- c(ea$DT$version, ea$versions_end) - tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + if (ea$time_type == "yearmonth") { + min(versions_with_updates) + seq(0, max(versions_with_updates) - min(versions_with_updates), by = 1) + } else { + tidyr::full_seq(versions_with_updates, guess_period(versions_with_updates)) + } } - #' Filter an `epi_archive` object to keep only older versions #' #' Generates a filtered `epi_archive` from an `epi_archive` object, keeping @@ -904,9 +931,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) { if (!identical(class(max_version), class(x$DT$version))) { cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.") } - if (!identical(typeof(max_version), typeof(x$DT$version))) { - cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.") - } assert_scalar(max_version, na.ok = FALSE) if (max_version > x$versions_end) { cli_abort("`max_version` must be at most `epi_archive$versions_end`.") @@ -983,3 +1007,163 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols data } + + + +#' [`dplyr::filter`] for `epi_archive`s +#' +#' @param .data an `epi_archive` +#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed +#' unless you use `.format_aware = TRUE`; see details. +#' @param .by as in [`dplyr::filter`] +#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See +#' details. 
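A quick sketch of the guardrail that this default enables; the exact error messages and classes are the ones raised in the implementation further below:

```r
library(dplyr)
library(epiprocess)

# Filtering on key columns other than `version` is allowed as usual:
archive_cases_dv_subset %>%
  filter(geo_value == "ca")

# Filtering on `version` (or on measurement columns) errors unless you opt in
# with `.format_aware = TRUE`:
try(
  archive_cases_dv_subset %>%
    filter(version >= as.Date("2020-06-02"))
)
```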
+#'
+#' @details
+#'
+#' By default, using the `version` column or measurement columns is disabled,
+#' as it's easy to get unexpected results. See if either [`epix_as_of`] or
+#' [`epix_slide`] works for any version selection you have in mind: for version
+#' selection, see the `version` or `.versions` args, respectively; for
+#' measurement column-based filtering, try `filter`ing after `epix_as_of` or
+#' inside the `.f` in `epix_slide()`. If they don't cover your use case, then
+#' you can set `.format_aware = TRUE` to enable usage of these columns, but be
+#' careful to:
+#' * Factor in that `.data$DT` may have been converted into a compact format
+#' based on diffing consecutive versions, and the last version of each
+#' observation in `.data$DT` will always be carried forward to future
+#' `version`s; see details of [`as_epi_archive`].
+#' * Set `clobberable_versions_start` and `versions_end` of the result
+#' appropriately after the `filter` call. They will be initialized with the
+#' same values as in `.data`.
+#'
+#' `dplyr::filter` also has an optional argument `.preserve`, which should not
+#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do
+#' not currently support `dplyr::filter`.
+#'
+#' @examples
+#'
+#' # Filter to one location and a particular time range:
+#' archive_cases_dv_subset %>%
+#'   filter(geo_value == "fl", time_value >= as.Date("2020-10-01"))
+#'
+#' # Convert to weekly by taking the Saturday data for each week, so that
+#' # `case_rate_7d_av` represents a Sun--Sat average:
+#' archive_cases_dv_subset %>%
+#'   filter(as.POSIXlt(time_value)$wday == 6L)
+#'
+#' # Filtering involving the `version` column or measurement columns requires
+#' # extra care. See epix_as_of and epix_slide instead for some common
+#' # operations. One semi-common operation that ends up being fairly simple is
+#' # treating observations as finalized after some amount of time, and ignoring
+#' # any revisions that were made after that point:
+#' archive_cases_dv_subset %>%
+#'   filter(
+#'     version <= time_value + as.difftime(60, units = "days"),
+#'     .format_aware = TRUE
+#'   )
+#'
+#' @export
+filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) {
+  in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal")
+  if (.format_aware) {
+    out_tbl <- in_tbl %>%
+      filter(..., .by = {{ .by }})
+  } else {
+    measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data))
+    forbidden_colnames <- c("version", measurement_colnames)
+    out_tbl <- in_tbl %>%
+      filter(
+        # Add our own fake filter arg to the user's ..., to update the data mask
+        # to prevent `version` column usage.
+        {
+          # We should be evaluating inside the data mask. To disable both
+          # `version` and `.data$version` etc., we need to go to the ancestor
+          # environment containing the data mask's column bindings. This is
+          # likely just the parent env, but search to make sure, in a way akin
+          # to `<<-`:
+          e <- environment()
+          while (!identical(e, globalenv()) && !identical(e, emptyenv())) { # nolint:vector_logic_linter
+            if ("version" %in% names(e)) {
+              # This is where the column bindings are. Replace the forbidden ones.
+              # They are expected to be active bindings, so directly
+              # assigning has issues; `rm` first.
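+              # (For intuition: the `delayedAssign()` calls below create
+              # promises, so the `cli_abort()`s only fire if a user's filter
+              # expression actually forces one of the forbidden bindings;
+              # filtering on, say, `geo_value` or `time_value` alone never
+              # touches them and proceeds normally.)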
+ rm(list = forbidden_colnames, envir = e) + eval_env <- new.env(parent = asNamespace("epiprocess")) # see (2) below + delayedAssign( + "version", + cli_abort(c( + "Using `version` in `filter.epi_archive` may produce unexpected results.", + ">" = "See if `epix_as_of` or `epix_slide` would work instead.", + ">" = "If not, see `?filter.epi_archive` details for how to proceed." + ), class = "epiprocess__filter_archive__used_version"), + eval.env = eval_env, + assign.env = e + ) + for (measurement_colname in measurement_colnames) { + # Record current `measurement_colname` and set up execution for + # the promise for the error in its own dedicated environment, so + # that (1) `for` loop updating its value and `rm` cleanup don't + # mess things up. We can also (2) prevent changes to data mask + # ancestry (to involve user's quosure env rather than our + # quosure env) or contents (from edge case of user binding + # functions inside the mask) from potentially interfering by + # setting the promise's execution environment to skip over the + # data mask. + eval_env <- new.env(parent = asNamespace("epiprocess")) + eval_env[["local_measurement_colname"]] <- measurement_colname + delayedAssign( + measurement_colname, + cli_abort(c( + "Using `{format_varname(local_measurement_colname)}` + in `filter.epi_archive` may produce unexpected results.", + ">" = "See `?filter.epi_archive` details for how to proceed." + ), class = "epiprocess__filter_archive__used_measurement"), + eval.env = eval_env, + assign.env = e + ) + } + break + } + e <- parent.env(e) + } + # Don't mask similarly-named user objects in ancestor envs: + rm(list = c("e", "measurement_colname", "eval_env")) + TRUE + }, + ..., + .by = {{ .by }} + ) + } + # We could try to re-infer the geo_type, e.g., when filtering from + # national+state to just state. However, we risk inference failures such as + # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when + # working with non-USA data: + out_geo_type <- .data$geo_type + if (.data$time_type == "day") { + # We might be going from daily to weekly; re-infer: + out_time_type <- guess_time_type(out_tbl$time_value) + } else { + # We might be filtering weekly to a single time_value; avoid re-inferring to + # stay "week". Or in other cases, just skip inferring, as re-inferring is + # expected to match the input time_type: + out_time_type <- .data$time_type + } + # Even if they narrow down to just a single value of an other_keys column, + # it's probably still better (& simpler) to treat it as an other_keys column + # since it still exists in the result: + out_other_keys <- .data$other_keys + # `filter` makes no guarantees about not aliasing columns in its result when + # the filter condition is all TRUE, so don't setDT. + out_dtbl <- as.data.table(out_tbl, key = c("geo_value", "time_value", out_other_keys, "version")) + result <- new_epi_archive( + out_dtbl, + out_geo_type, out_time_type, out_other_keys, + # Assume version-related metadata unchanged; part of why we want to push + # back on filter expressions like `.data$version <= .env$as_of`: + .data$clobberable_versions_start, .data$versions_end + ) + # Filtering down rows while keeping all (ukey) columns should preserve ukey + # uniqueness. + result +} diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 1191521c9..51f6cf33f 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -499,34 +499,40 @@ group_epi_df <- function(x, exclude = character()) { #' the resulting `epi_df` will have `geo_value` set to `"total"`. 
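For orientation before the argument details: a minimal sketch of the aggregation this performs, using the `grad_employ_subset` dataset referenced in the examples below (collapsing over the `edu_qual` key while keeping `age_group`):

```r
library(dplyr)
library(epiprocess)

grad_employ_subset %>%
  sum_groups_epi_df(num_graduates, group_cols = c("geo_value", "age_group"))
```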
#' #' @param .x an `epi_df` -#' @param sum_cols character vector of the columns to aggregate +#' @param sum_cols `r tidyselect_arg_roxygen` #' @param group_cols character vector of column names to group by. "time_value" is -#' included by default. +#' included by default. #' @return an `epi_df` object #' +#' @examples +#' # This data has other_keys age_group and edu_qual: +#' grad_employ_subset +#' +#' # Aggregate num_graduates within each geo_value (and time_value): +#' grad_employ_subset %>% +#' sum_groups_epi_df(num_graduates, group_cols = "geo_value") +#' #' @export -sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) { +sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { assert_class(.x, "epi_df") - assert_character(sum_cols) assert_character(group_cols) - checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x))) checkmate::assert_subset(group_cols, key_colnames(.x)) if (!"time_value" %in% group_cols) { group_cols <- c("time_value", group_cols) } - - out <- .x %>% - group_by(across(all_of(group_cols))) %>% - dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop") + # Attempt tidyselection ourselves to get "Error in `sum_groups_epi_df()`" + # rather than "in `dplyr::summarize()`", before forwarding: + sum_cols <- rlang::enquo(sum_cols) + tidyselect::eval_select(sum_cols, .x) + out <- group_by(.x, across(all_of(group_cols))) %>% + dplyr::summarize(across(!!sum_cols, sum), .groups = "drop") # To preserve epi_df-ness, we need to ensure that the `geo_value` column is # present. - out <- if (!"geo_value" %in% group_cols) { - out %>% + if (!"geo_value" %in% group_cols) { + out <- out %>% mutate(geo_value = "total") %>% - relocate(geo_value, .before = 1) - } else { - out + relocate("geo_value", .before = 1) } # The `geo_type` will be correctly inherited here by the following logic: @@ -535,10 +541,10 @@ sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) # - if `geo_value` is not in `group_cols`, then the constructor will see # the unrecognizeable "total" value and will correctly infer the "custom" # geo_type. - out %>% - as_epi_df( - as_of = attr(.x, "metadata")$as_of, - other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols) - ) %>% + as_epi_df( + out, + as_of = attr(.x, "metadata")$as_of, + other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols) + ) %>% arrange_canonical() } diff --git a/R/revision_analysis.R b/R/revision_analysis.R index f36dcc16a..4e12fb77c 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -26,7 +26,7 @@ #' the window afterwards at 150. #' #' @param epi_arch an epi_archive to be analyzed -#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to +#' @param ... <[`tidyselect`][dplyr::dplyr_tidy_select]>, used to choose the column to #' summarize. If empty and there is only one value/measurement column (i.e., #' not in [`key_colnames`]) in the archive, it will automatically select it. #' If supplied, `...` must select exactly one column. @@ -34,31 +34,16 @@ #' `NA`'s compactify is run again if `compactify` is `TRUE` to make #' sure there are no duplicate values from occasions when the signal is #' revised to `NA`, and then back to its immediately-preceding value. -#' @param print_inform bool, determines whether to print summary information, or -#' only return the full summary tibble #' @param min_waiting_period `difftime`, integer or `NULL`. 
Sets a cutoff: any
 #' time_values that have not had at least `min_waiting_period` to stabilize as
 #' of the `versions_end` are removed. `min_waiting_period` should characterize
 #' the typical time during which most significant revisions occur. The default
 #' of 60 days corresponds to a typical near-final value for case counts as
 #' reported in the context of insurance. To avoid this filtering, either set
-#' to `NULL` or 0.
+#' to `NULL` or 0. A `difftime` will be rounded up to the appropriate `time_type` if
+#' necessary (that is, 5 days will be rounded to 1 week if the data is weekly).
 #' @param within_latest double between 0 and 1. Determines the threshold
 #' used for the `lag_to`
-#' @param quick_revision difftime or integer (integer is treated as days), for
-#' the printed summary, the amount of time between the final revision and the
-#' actual time_value to consider the revision quickly resolved. Default of 3
-#' days
-#' @param few_revisions integer, for the printed summary, the upper bound on the
-#' number of revisions to consider "few". Default is 3.
-#' @param abs_spread_threshold length-1 numeric, for the printed summary, the
-#' maximum spread used to characterize revisions which don't actually change
-#' very much. Default is 5% of the maximum value in the dataset, but this is
-#' the most unit dependent of values, and likely needs to be chosen
-#' appropriate for the scale of the dataset.
-#' @param rel_spread_threshold length-1 double between 0 and 1, for the printed
-#' summary, the relative spread fraction used to characterize revisions which
-#' don't actually change very much. Default is .1, or 10% of the final value
 #' @param compactify bool. If `TRUE`, we will compactify after the signal
 #' requested in `...` has been selected on its own and the `drop_nas` step.
 #' This helps, for example, to give similar results when called on
@@ -67,6 +52,8 @@
 #' requested signal. The default is `TRUE`.
 #' @param compactify_abs_tol length-1 double, used if `compactify` is `TRUE`, it
 #' determines the threshold for when two doubles are considered identical.
+#' @param return_only_tibble boolean to return only the simple `tibble` of
+#' computational results rather than the complete S3 object.
 #'
 #' @details Applies to `epi_archive`s with `time_type`s of `"day"`, `"week"`,
 #' and `"yearmonth"`. It can also work with a `time_type` of `"integer"` if
@@ -76,9 +63,15 @@
 #' produce incorrect results for some calculations, since week numbering
 #' contains jumps at year boundaries.
 #'
+#' @return An S3 object with class `revision_analysis`. This function is typically
+#' called for the purpose of inspecting the printed output. The
+#' results of the computations are available in
+#' `revision_analysis(...)$revision_behavior`. If you only want to access
+#' the internal computations, use `return_only_tibble = TRUE`.
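To make the return-value description concrete, a short sketch of the two access patterns it implies:

```r
library(epiprocess)

ra <- revision_analysis(archive_cases_dv_subset, percent_cli)
ra                    # inspect the printed summary (see the print method below)
ra$revision_behavior  # the underlying per-key-and-time_value tibble

# Or skip the S3 wrapper entirely:
revision_analysis(archive_cases_dv_subset, percent_cli, return_only_tibble = TRUE)
```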
+#' #' @examples -#' revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) -#' revision_example %>% arrange(desc(spread)) +#' revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) +#' revision_example$revision_behavior %>% arrange(desc(spread)) #' #' @export #' @importFrom cli cli_inform cli_abort cli_li @@ -86,21 +79,19 @@ #' @importFrom vctrs vec_cast #' @importFrom dplyr mutate group_by arrange filter if_any all_of across pull pick c_across #' everything ungroup summarize if_else %>% -revision_summary <- function(epi_arch, - ..., - drop_nas = TRUE, - print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") %>% - difftime_approx_ceiling_time_delta(epi_arch$time_type), - few_revisions = 3, - abs_spread_threshold = NULL, - rel_spread_threshold = 0.1, - compactify = TRUE, - compactify_abs_tol = 0) { +revision_analysis <- function(epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE) { assert_class(epi_arch, "epi_archive") + if (methods::is(min_waiting_period, "difftime")) { + min_waiting_period <- min_waiting_period %>% + difftime_approx_ceiling_time_delta(epi_arch$time_type) + } # if the column to summarize isn't specified, use the only one if there is only one if (dots_n(...) == 0) { # Choose the first column that's not a key: @@ -126,11 +117,6 @@ revision_summary <- function(epi_arch, cli_abort("Not currently implementing more than one column at a time. Run each separately.") } } - if (is.null(abs_spread_threshold)) { - abs_spread_threshold <- .05 * epi_arch$DT %>% - pull(!!arg) %>% - max(na.rm = TRUE) - } # for each time_value, get # the number of revisions # the maximum spread in value (both absolute and relative) @@ -193,63 +179,113 @@ revision_summary <- function(epi_arch, time_value, geo_value, all_of(epikey_names), n_revisions, min_lag, max_lag, # nolint: object_usage_linter lag_near_latest, spread, rel_spread, min_value, max_value, median_value # nolint: object_usage_linter ) - if (print_inform) { - cli_inform("Min lag (time to first version):") - time_delta_summary(revision_behavior$min_lag, time_type) %>% print() - if (!drop_nas) { - total_na <- epi_arch$DT %>% - filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter - nrow() - cli_inform("Fraction of all versions that are `NA`:") - cli_li(num_percent(total_na, nrow(epi_arch$DT), "")) - cli_inform("") - } - cli_inform("Fraction of epi_key+time_values with") - total_num <- nrow(revision_behavior) # nolint: object_usage_linter - total_num_unrevised <- sum(revision_behavior$n_revisions == 0) # nolint: object_usage_linter - cli_inform("No revisions:") - cli_li(num_percent(total_num_unrevised, total_num, "")) - total_quickly_revised <- sum( # nolint: object_usage_linter - time_delta_to_n_steps(revision_behavior$max_lag, time_type) <= - time_delta_to_n_steps(quick_revision, time_type) - ) - cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, time_type)} - of the `time_value`):") - cli_li(num_percent(total_quickly_revised, total_num, "")) - total_barely_revised <- sum( # nolint: object_usage_linter - revision_behavior$n_revisions <= - few_revisions - ) - cli_inform("Few revisions (At most {few_revisions} revisions for that `time_value`):") - 
cli_li(num_percent(total_barely_revised, total_num, ""))
-    cli_inform("")
-    cli_inform("Fraction of revised epi_key+time_values which have:")
+  total_na <- epi_arch$DT %>%
+    filter(is.na(c_across(!!arg))) %>% # nolint: object_usage_linter
+    nrow()
+  if (!return_only_tibble) {
+    revision_behavior <- structure(list(
+      revision_behavior = revision_behavior,
+      range_time_values = range(epi_arch$DT$time_value),
+      signal_variable = arg,
+      drop_nas = drop_nas,
+      time_type = time_type,
+      total_na = total_na,
+      max_val = max(epi_arch$DT[[arg]], na.rm = TRUE),
+      n_obs = nrow(epi_arch$DT),
+      within_latest = within_latest
+    ), class = "revision_analysis")
+  }
+  return(revision_behavior)
+}
+
+

-    real_revisions <- revision_behavior %>% filter(n_revisions > 0) # nolint: object_usage_linter
-    n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter
-    rel_spread <- sum( # nolint: object_usage_linter
-      real_revisions$rel_spread <
-        rel_spread_threshold,
-      na.rm = TRUE
-    ) + sum(is.na(real_revisions$rel_spread))
-    cli_inform("Less than {rel_spread_threshold} spread in relative value:")
-    cli_li(num_percent(rel_spread, n_real_revised, ""))
-    abs_spread <- sum( # nolint: object_usage_linter
-      real_revisions$spread >
-        abs_spread_threshold
-    ) # nolint: object_usage_linter
-    cli_inform("Spread of more than {abs_spread_threshold} in actual value (when revised):")
-    cli_li(num_percent(abs_spread, n_real_revised, ""))
-    # time_type_unit_pluralizer[[time_type]] is a format string controlled by us
-    # and/or downstream devs, so we can paste it onto our format string safely:
-    units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[time_type]])) # nolint: object_usage_linter
-    cli_inform("{toTitleCase(units_plural)} until within {within_latest*100}% of the latest value:")
-    time_delta_summary(revision_behavior[["lag_near_latest"]], time_type) %>% print()
+#' Print a `revision_analysis` object
+#'
+#' @param x a `revision_analysis` object
+#' @param quick_revision Difftime or integer (integer is treated as days).
+#' The amount of time between the final revision and the
+#' actual time_value to consider the revision quickly resolved. Default of 3
+#' days. This will be rounded up to the appropriate `time_type` if
+#' necessary (that is, 5 days will be rounded to 1 week if the data is weekly).
+#' @param few_revisions Integer. The upper bound on the
+#' number of revisions to consider "few". Default is 3.
+#' @param abs_spread_threshold Scalar numeric. The
+#' maximum spread used to characterize revisions which don't actually change
+#' very much. Default is 5% of the maximum value in the dataset, but this is
+#' the most unit-dependent of these values, and likely needs to be chosen
+#' appropriately for the scale of the dataset.
+#' @param rel_spread_threshold Scalar between 0 and 1. The relative spread
+#' fraction used to characterize revisions which don't actually change very
+#' much. Default is .1, or 10% of the final value.
+#'
+#' @rdname revision_analysis
+#' @export
+print.revision_analysis <- function(x,
+                                    quick_revision = as.difftime(3, units = "days"),
+                                    few_revisions = 3,
+                                    abs_spread_threshold = NULL,
+                                    rel_spread_threshold = 0.1,
+                                    ...)
{ + if (methods::is(quick_revision, "difftime")) { + quick_revision <- quick_revision %>% + difftime_approx_ceiling_time_delta(x$time_type) } - return(revision_behavior) + if (is.null(abs_spread_threshold)) abs_spread_threshold <- .05 * x$max_val + rev_beh <- x$revision_behavior + cli::cli_h2("An epi_archive spanning {.val {x$range_time_values[1]}} to {.val {x$range_time_values[2]}}.") + cli::cli_h3("Min lag (time to first version):") + time_delta_summary(rev_beh$min_lag, x$time_type) %>% print() + if (!x$drop_nas) { + cli_inform("Fraction of all versions that are `NA`:") + cli_li(num_percent(x$total_na, x$n_obs, "")) + cli_inform("") + } + cli::cli_h3("Fraction of epi_key + time_values with") + total_num <- nrow(rev_beh) # nolint: object_usage_linter + total_num_unrevised <- sum(rev_beh$n_revisions == 0) # nolint: object_usage_linter + cli_inform("No revisions:") + cli_li(num_percent(total_num_unrevised, total_num, "")) + total_quickly_revised <- sum( # nolint: object_usage_linter + time_delta_to_n_steps(rev_beh$max_lag, x$time_type) <= + time_delta_to_n_steps(quick_revision, x$time_type) + ) + cli_inform("Quick revisions (last revision within {format_time_delta(quick_revision, x$time_type)} + of the `time_value`):") + cli_li(num_percent(total_quickly_revised, total_num, "")) + total_barely_revised <- sum(rev_beh$n_revisions <= few_revisions) + cli_inform("Few revisions (At most {.val {few_revisions}} revisions for that `time_value`):") + cli_li(num_percent(total_barely_revised, total_num, "")) + + cli::cli_h3("Fraction of revised epi_key + time_values which have:") + + real_revisions <- rev_beh %>% filter(n_revisions > 0) # nolint: object_usage_linter + n_real_revised <- nrow(real_revisions) # nolint: object_usage_linter + rel_spread <- sum( # nolint: object_usage_linter + real_revisions$rel_spread < rel_spread_threshold, + na.rm = TRUE + ) + sum(is.na(real_revisions$rel_spread)) + cli_inform("Less than {.val {rel_spread_threshold}} spread in relative value:") + cli_li(num_percent(rel_spread, n_real_revised, "")) + abs_spread <- sum( # nolint: object_usage_linter + real_revisions$spread > abs_spread_threshold + ) # nolint: object_usage_linter + divid <- cli::cli_div(theme = list(.val = list(digits = 3))) + cli_inform("Spread of more than {.val {abs_spread_threshold}} in actual value (when revised):") + cli::cli_end(divid) + cli_li(num_percent(abs_spread, n_real_revised, "")) + + # time_type_unit_pluralizer[[time_type]] is a format string controlled by us + # and/or downstream devs, so we can paste it onto our format string safely: + units_plural <- pluralize(paste0("{qty(2)}", time_type_unit_pluralizer[[x$time_type]])) # nolint: object_usage_linter + cli::cli_h3("{toTitleCase(units_plural)} until within {.val {x$within_latest*100}}% of the latest value:") + time_delta_summary(rev_beh[["lag_near_latest"]], x$time_type) %>% print() } +#' @export +#' @rdname revision_analysis +revision_summary <- revision_analysis + #' pull the value from lags when values starts indefinitely being within prop of its latest value. #' @param lags vector of lags; should be sorted #' @param values this should be a vector (e.g., a column) with length matching that of `lags` diff --git a/R/slide.R b/R/slide.R index abc7c3b77..bc993bf4a 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,47 +1,82 @@ -#' Slide a function over variables in an `epi_df` object +#' More general form of [`epi_slide_opt`] for rolling/running computations #' -#' @description Slides a given function over variables in an `epi_df` object. 
-#' This is useful for computations like rolling averages. The function supports -#' many ways to specify the computation, but by far the most common use case is -#' as follows: +#' Most rolling/running computations can be handled by [`epi_slide_mean`], +#' [`epi_slide_sum`], or the medium-generality [`epi_slide_opt`] functions, +#' which have been specialized to run more quickly. `epi_slide()` is a slower +#' but even more general function for rolling/running computations, and uses a +#' different interface to specify the computations; you typically only need to +#' consider using `epi_slide()` if you have a computation that depends on +#' multiple columns simultaneously, outputs multiple columns simultaneously, or +#' produces non-numeric output. For example, this computation depends on +#' multiple columns: #' #' ``` -#' # Create new column `cases_7dmed` that contains a 7-day trailing median of cases -#' epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) +#' cases_deaths_subset %>% +#' epi_slide( +#' cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]], +#' .window_size = 22 +#' ) %>% +#' print(n = 30) #' ``` #' -#' For two very common use cases, we provide optimized functions that are much -#' faster than `epi_slide`: `epi_slide_mean()` and `epi_slide_sum()`. We -#' recommend using these functions when possible. +#' (Here, the value 22 was selected using `epi_cor()` and averaging across +#' `geo_value`s. See +#' \href{https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1}{this +#' manuscript} for some warnings & information using similar types of CFR +#' estimators.) #' #' See `vignette("epi_df")` for more examples. #' #' @template basic-slide-params -#' @param .f Function, formula, or missing; together with `...` specifies the -#' computation to slide. The return of the computation should either be a -#' scalar or a 1-row data frame. Data frame returns will be -#' `tidyr::unpack()`-ed, if named, and will be [`tidyr::pack`]-ed columns, if -#' not named. See examples. +#' @param .f,... The computation to slide. The input will be a time window of +#' the data for a single subpopulation (i.e., a single `geo_value` and single +#' value for any [`other_keys`][as_epi_df] you set up, such as age groups, race, etc.). +#' The input will always have the same size, determined by `.window_size`, and +#' will fill in any missing `time_values`, using `NA` values for missing +#' measurements. The output should be a scalar value or a 1-row data frame; +#' these outputs will be collected into a new column or columns in the +#' `epi_slide()` result. Data frame outputs will be unpacked into multiple +#' columns in the result by default, or [`tidyr::pack`]ed into a single +#' data-frame-type column if you provide a name for such a column (e.g., via +#' `.new_col_name`). +#' +#' You can specify the computation in one of the following ways: +#' +#' - Don't provide `.f`, and instead use one or more +#' [`dplyr::summarize`]-esque ["data-masking"][rlang::args_data_masking] +#' expressions in `...`, e.g., `cfr_estimate_v0 = +#' death_rate_7d_av[[22]]/case_rate_7d_av[[1]]`. This way is sometimes more +#' convenient, but also has the most computational overhead. +#' +#' - Provide a formula in `.f`, e.g., `~ +#' .x$death_rate_7d_av[[22]]/.x$case_rate_7d_av[[1]]`. In this formula, `.x` +#' is an `epi_df` containing data for a single time window as described above, +#' taken from the original `.x` fed into `epi_slide()`. 
+#'
+#' - Provide a function in `.f`, e.g., `function(x, g, t)
+#' x$death_rate_7d_av[[22]]/x$case_rate_7d_av[[1]]`. The function should be of
+#' the form `function(x, g, t)` or `function(x, g, t, <additional arguments>)`,
+#' where:
 #'
-#' - If `.f` is missing, then `...` will specify the computation via
-#' tidy-evaluation. This is usually the most convenient way to use
-#' `epi_slide`. See examples.
-#' - If `.f` is a formula, then the formula should use `.x` (not the same as
-#' the input `epi_df`) to operate on the columns of the input `epi_df`, e.g.
-#' `~mean(.x$var)` to compute a mean of `var`.
-#' - If a function, `.f` must have the form `function(x, g, t, ...)`, where:
 #' - `x` is a data frame with the same column names as the original object,
-#' minus any grouping variables, with only the windowed data for one
-#' group-`.ref_time_value` combination
-#' - `g` is a one-row tibble containing the values of the grouping variables
-#' for the associated group
+#' minus any grouping variables, with only the windowed data for one
+#' group-`.ref_time_value` combination
+#'
+#' - `g` is a one-row tibble specifying the `geo_value` and value of any
+#' `other_keys` for this computation
+#'
 #' - `t` is the `.ref_time_value` for the current window
-#' - `...` are additional arguments
 #'
-#' @param ... Additional arguments to pass to the function or formula specified
-#' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted
-#' as a ["data-masking"][rlang::args_data_masking] expression or expressions
-#' for tidy evaluation.
+#' - If you have a complex `.f` containing `<additional arguments>`, you can
+#' provide values for those arguments in the `...` argument to `epi_slide()`.
+#'
+#' The values of `g` and `t` are also available to data-masking-expression and
+#' formula-based computations as `.group_key` and `.ref_time_value`,
+#' respectively. Formula computations also let you use `.y` or `.z`,
+#' respectively, as additional names for these same quantities (similar to
+#' [`dplyr::group_modify`]).
+#'
 #' @param .new_col_name Name for the new column that will contain the computed
 #' values. The default is "slide_value" unless your slide computations output
 #' data frames, in which case they will be unpacked (as in `tidyr::unpack()`)
@@ -49,14 +84,34 @@
 #' be given names that clash with the existing columns of `.x`.
 #'
 #' @details
+#'
+#' ## Motivation and lower-level alternatives
+#'
+#' `epi_slide()` is focused on preventing errors and providing a convenient
+#' interface. If you need computational speed, many computations can be
+#' optimized by one of the following:
+#'
+#' * Performing core sliding operations with `epi_slide_opt()` and
+#' `frollapply`, and using potentially-grouped `mutate()`s to transform or
+#' combine the results.
+#'
+#' * Grouping by `geo_value` and any `other_keys`; [`complete()`]ing with
+#' `full_seq()` to fill in time gaps; `arrange()`ing by `time_value`s within
+#' each group; using `mutate()` with vectorized operations and shift operators
+#' like `dplyr::lead()` and `dplyr::lag()` to perform the core operations,
+#' being careful to give the desired results for the least and most recent
+#' `time_value`s (often `NA`s for the least recent); ungrouping; and
+#' `filter()`ing back down to only rows that existed before the `complete()`
+#' stage if necessary. See the sketch below.
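+#'
+#' For example, a rough sketch of the second approach, assuming daily data
+#' with no `other_keys` and a hypothetical derived column `cases_7d_ago`:
+#'
+#' ```
+#' library(dplyr)
+#' cases_deaths_subset %>%
+#'   group_by(geo_value) %>%
+#'   tidyr::complete(time_value = tidyr::full_seq(time_value, period = 1)) %>%
+#'   arrange(time_value, .by_group = TRUE) %>%
+#'   mutate(cases_7d_ago = lag(cases, 7)) %>%
+#'   ungroup() %>%
+#'   # crude: this also drops any pre-existing rows with `NA` cases:
+#'   filter(!is.na(cases))
+#' ```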
+#' #' ## Advanced uses of `.f` via tidy evaluation #' -#' If specifying `.f` via tidy evaluation, in addition to the standard [`.data`] -#' and [`.env`], we make some additional "pronoun"-like bindings available: +#' If specifying `.f` via tidy evaluation, in addition to the standard [`.data`][rlang::.data] +#' and [`.env`][rlang::.env], we make some additional "pronoun"-like bindings available: #' #' - .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object #' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun] -#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and +#' like `.data`; this allows you to use additional `dplyr`, `tidyr`, and #' `epiprocess` operations. If you have multiple expressions in `...`, this #' won't let you refer to the output of the earlier expressions, but `.data` #' will. @@ -72,34 +127,43 @@ #' @examples #' library(dplyr) #' -#' # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases -#' cases_deaths_subset %>% +#' # Generate some simple time-varying CFR estimates: +#' with_cfr_estimates <- cases_deaths_subset %>% #' epi_slide( -#' cases_7sd = sd(cases, na.rm = TRUE), -#' cases_7dav = mean(cases, na.rm = TRUE), -#' .window_size = 7 -#' ) %>% -#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' # Note that epi_slide_mean could be used to more quickly calculate cases_7dav. +#' cfr_estimate_v0 = death_rate_7d_av[[22]] / case_rate_7d_av[[1]], +#' .window_size = 22 +#' ) +#' with_cfr_estimates %>% +#' print(n = 30) +#' # (Here, the value 22 was selected using `epi_cor()` and averaging across +#' # `geo_value`s. See +#' # https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1 for some +#' # warnings & information using CFR estimators along these lines.) #' -#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or -#' # formula in a way similar to [`dplyr::group_modify`]: -#' my_summarizer <- function(window_data) { -#' window_data %>% -#' summarize( -#' cases_7sd = sd(cases, na.rm = TRUE), -#' cases_7dav = mean(cases, na.rm = TRUE) -#' ) +#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a +#' # function or formula in a way similar to [`dplyr::group_modify`]; these +#' # often run much more quickly: +#' my_computation <- function(window_data) { +#' tibble( +#' cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / +#' window_data$case_rate_7d_av[[1]] +#' ) #' } -#' cases_deaths_subset %>% +#' with_cfr_estimates2 <- cases_deaths_subset %>% #' epi_slide( -#' ~ my_summarizer(.x), -#' .window_size = 7 -#' ) %>% -#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' -#' -#' +#' ~ my_computation(.x), +#' .window_size = 22 +#' ) +#' with_cfr_estimates3 <- cases_deaths_subset %>% +#' epi_slide( +#' function(window_data, g, t) { +#' tibble( +#' cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / +#' window_data$case_rate_7d_av[[1]] +#' ) +#' }, +#' .window_size = 22 +#' ) #' #' #' #### Advanced: #### @@ -549,20 +613,25 @@ get_before_after_from_window <- function(window_size, align, time_type) { list(before = before, after = after) } -#' Optimized slide functions for common cases +#' Calculate rolling or running means, sums, etc., or custom calculations #' -#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] -#' or [slider::summary-slide] function over variables in an `epi_df` object. -#' These functions tend to be much faster than `epi_slide()`. 
See
-#' `vignette("epi_df")` for more examples.
+#' @description These methods take each subpopulation (i.e., a single
+#' `geo_value` and combination of any `other_keys` you set up for age groups,
+#' etc.) and perform a `.window_size`-width time window rolling/sliding
+#' computation, or alternatively, a running/cumulative computation (with
+#' `.window_size = Inf`) on the requested columns. Explicit `NA` measurements
+#' are temporarily added to fill in any time gaps, and, for rolling
+#' computations, to pad the time series to ensure that the first & last
+#' computations use exactly `.window_size` values.
+#'
+#' `epi_slide_opt` allows you to use any [data.table::froll] or
+#' [slider::summary-slide] function. If none of those specialized functions fit
+#' your use case, you can use `data.table::frollapply` together with a
+#' non-rolling function (e.g., `median`). See [`epi_slide`] if you need to work
+#' with multiple columns at once or output a custom type.
 #'
 #' @template basic-slide-params
-#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
-#' name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
-#' [other tidy-select expression][tidyselect::language], or a vector of
-#' characters (e.g. `c("cases", "deaths")`). Variable names can be used as if
-#' they were positions in the data frame, so expressions like `x:y` can be
-#' used to select a range of variables.
+#' @param .col_names `r tidyselect_arg_roxygen`
 #'
 #' The tidy-selection renaming interface is not supported, and cannot be used
 #' to provide output column names; if you want to customize the output column
@@ -1108,7 +1177,7 @@ epi_slide_sum <- function(
 #' `before` and `after` args are assumed to have been validated by the calling
 #' function (using `validate_slide_window_arg`).
 #'
-#' @importFrom checkmate assert_function
+#' @importFrom checkmate assert_function anyInfinite
 #' @keywords internal
 full_date_seq <- function(x, before, after, time_type) {
   if (!time_type %in% c("day", "week", "yearmonth", "integer")) {
@@ -1126,7 +1195,7 @@ full_date_seq <- function(x, before, after, time_type) {
 
   if (time_type %in% c("yearmonth", "integer")) {
     all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L)
 
-    if (before != 0 && before != Inf) {
+    if (before != 0 && !anyInfinite(before)) {
       pad_early_dates <- all_dates[1L] - before:1
     }
     if (after != 0) {
@@ -1139,7 +1208,7 @@ full_date_seq <- function(x, before, after, time_type) {
     )
     all_dates <- seq(min(x$time_value), max(x$time_value), by = by)
 
-    if (before != 0 && before != Inf) {
+    if (before != 0 && !anyInfinite(before)) {
       # The behavior is analogous to the branch with tsibble types above. For
       # more detail, note that the function `seq.Date(from, ..., length.out =
       # n)` returns `from + 0:n`. Since we want `from + 1:n`, we drop the first
diff --git a/R/utils.R b/R/utils.R
index 66270d69c..972170998 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -98,26 +98,29 @@ format_chr_deparse <- function(x) {
   paste(collapse = "", deparse(x))
 }
 
-#' Format a character vector as a string via deparsing/quoting each
+#' Format each entry in a character vector via quoting; special replacement for length 0
+#'
+#' Performs no escaping within the strings; if you want something that a reader
+#' could copy-paste to debug, look into `format_deparse` (note that this
+#' collapses into a single string).
+#'
+#' @param x chr; e.g., `colnames` of some data frame
+#' @param empty chr, likely string; what should be output if `x` is of length 0?
+#' @return chr; same `length` as `x` if `x` had nonzero length; value of `empty` otherwise +#' +#' @examples +#' cli::cli_inform('{epiprocess:::format_chr_with_quotes("x")}') +#' cli::cli_inform('{epiprocess:::format_chr_with_quotes(c("x","y"))}') +#' nms <- c("x", "\"Total Cases\"") +#' cli::cli_inform("{epiprocess:::format_chr_with_quotes(nms)}") +#' cli::cli_inform("{epiprocess:::format_chr_with_quotes(character())}") #' -#' @param x `chr`; e.g., `colnames` of some data frame -#' @param empty string; what should be output if `x` is of length 0? -#' @return string #' @keywords internal format_chr_with_quotes <- function(x, empty = "*none*") { if (length(x) == 0L) { empty } else { - # Deparse to get quoted + escape-sequenced versions of varnames; collapse to - # single line (assuming no newlines in `x`). Though if we hand this to cli - # it may insert them (even in middle of quotes) while wrapping lines. - deparsed_collapsed <- paste(collapse = "", deparse(x)) - if (length(x) == 1L) { - deparsed_collapsed - } else { - # remove surrounding `c()`: - substr(deparsed_collapsed, 3L, nchar(deparsed_collapsed) - 1L) - } + paste0('"', x, '"') } } diff --git a/_pkgdown.yml b/_pkgdown.yml index 3742bd416..ebd73dc84 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -37,20 +37,12 @@ articles: contents: - compactify -repo: - url: - home: https://github.com/cmu-delphi/epiprocess/ - source: https://github.com/cmu-delphi/epiprocess/ - issue: https://github.com/cmu-delphi/epiprocess/issues - reference: - title: "`epi_df` basics" desc: Constructors and information for `epi_df` objects. - contents: - epi_df - - print.epi_df - group_epi_df - - autoplot.epi_df - title: "`epi_df` manipulation" desc: Functions operating on `epi_df` objects. @@ -71,7 +63,6 @@ reference: desc: Constructors and information for `epi_archive` objects. - contents: - epi_archive - - print.epi_archive - clone - group_by.epi_archive @@ -79,11 +70,20 @@ reference: desc: Functions operating on `epi_archive` objects. - contents: - epix_as_of + - epix_as_of_current - epix_slide - - epix_merge - revision_summary + - epix_merge + - filter.epi_archive - epix_fill_through_version - epix_truncate_versions_after + - set_versions_end + + - title: Basic analysis and visualization + - contents: + - starts_with("autoplot") + - starts_with("print") + - revision_summary - title: Example data - contents: diff --git a/man/autoplot-epi.Rd b/man/autoplot-epi.Rd new file mode 100644 index 000000000..c7c8a7c4f --- /dev/null +++ b/man/autoplot-epi.Rd @@ -0,0 +1,149 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/autoplot.R +\name{autoplot-epi} +\alias{autoplot-epi} +\alias{autoplot.epi_df} +\alias{autoplot.epi_archive} +\alias{plot.epi_df} +\alias{plot.epi_archive} +\title{Automatically plot an epi_df or epi_archive} +\usage{ +\method{autoplot}{epi_df}( + object, + ..., + .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), + .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), + .base_color = "#3A448F", + .facet_filter = NULL, + .max_facets = deprecated() +) + +\method{autoplot}{epi_archive}( + object, + ..., + .base_color = "black", + .versions = NULL, + .mark_versions = FALSE, + .facet_filter = NULL +) + +\method{plot}{epi_df}(x, ...) + +\method{plot}{epi_archive}(x, ...) +} +\arguments{ +\item{object, x}{An \code{epi_df} or \code{epi_archive}} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted +expressions separated by commas. 
Variable names can be used as if they +were positions in the data frame, so expressions like \code{x:y} can +be used to select a range of variables.} + +\item{.color_by}{Which variables should determine the color(s) used to plot +lines. Options include: +\itemize{ +\item \code{all_keys} - the default uses the interaction of any key variables +including the \code{geo_value} +\item \code{geo_value} - \code{geo_value} only +\item \code{other_keys} - any available keys that are not \code{geo_value} +\item \code{.response} - the numeric variables (same as the y-axis) +\item \code{all} - uses the interaction of all keys and numeric variables +\item \code{none} - no coloring aesthetic is applied +}} + +\item{.facet_by}{Similar to \code{.color_by} except that the default is to display +each numeric variable on a separate facet} + +\item{.base_color}{Lines will be shown with this color if \code{.color_by == "none"}. +For example, with a single numeric variable and faceting by \code{geo_value}, all +locations would share the same color line.} + +\item{.facet_filter}{Select which facets will be displayed. Especially +useful for when there are many \code{geo_value}'s or keys. This is a +<\code{\link[rlang:args_data_masking]{rlang}}> expression along the lines of \code{\link[dplyr:filter]{dplyr::filter()}}. +However, it must be a single expression combined with the \code{&} operator. This +contrasts to the typical use case which allows multiple comma-separated expressions +which are implicitly combined with \code{&}. When multiple variables are selected +with \code{...}, their names can be filtered in combination with other factors +by using \code{.response_name}. See the examples below.} + +\item{.max_facets}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}} + +\item{.versions}{Select which versions will be displayed. By default, +a separate line will be shown with the data as it would have appeared on +every day in the archive. This can sometimes become overwhelming. For +example, daily data would display a line for what the data would have looked +like on every single day. To override this, you can select specific dates, +by passing a vector of values here. Alternatively, a sequence can be +automatically created by passing a string like \code{"2 weeks"} or \code{"month"}. +For time types where the \code{time_value} is a date object, any string that +is interpretable by \code{\link[base:seq.Date]{base::seq.Date()}} is allowed. + +For \code{time_type = "integer"}, an integer larger than 1 will give a subset +of versions.} + +\item{.mark_versions}{Logical. Indicate whether to mark each version with +a vertical line. 
Note that displaying many versions can become busy.} +} +\value{ +A \link[ggplot2:ggplot]{ggplot2::ggplot} object +} +\description{ +Automatically plot an epi_df or epi_archive +} +\examples{ +# -- Use it on an `epi_df` +autoplot(cases_deaths_subset, cases, death_rate_7d_av) +autoplot(cases_deaths_subset, case_rate_7d_av, .facet_by = "geo_value") +autoplot(cases_deaths_subset, case_rate_7d_av, + .color_by = "none", + .facet_by = "geo_value" +) +autoplot(cases_deaths_subset, case_rate_7d_av, + .color_by = "none", + .base_color = "red", .facet_by = "geo_value" +) + +# .base_color specification won't have any effect due .color_by default +autoplot(cases_deaths_subset, case_rate_7d_av, + .base_color = "red", .facet_by = "geo_value" +) + +# filter to only some facets, must be explicitly combined +autoplot(cases_deaths_subset, cases, death_rate_7d_av, + .facet_by = "all", + .facet_filter = (.response_name == "cases" & geo_value \%in\% c("tx", "pa")) | + (.response_name == "death_rate_7d_av" & + geo_value \%in\% c("ca", "fl", "ga", "ny")) +) +# Just an alias for convenience +plot(cases_deaths_subset, cases, death_rate_7d_av, + .facet_by = "all", + .facet_filter = (.response_name == "cases" & geo_value \%in\% c("tx", "pa")) | + (.response_name == "death_rate_7d_av" & + geo_value \%in\% c("ca", "fl", "ga", "ny")) +) + +# -- Use it on an archive + +autoplot(archive_cases_dv_subset, percent_cli, .versions = "week") +autoplot(archive_cases_dv_subset_all_states, percent_cli, + .versions = "week", + .facet_filter = geo_value \%in\% c("or", "az", "vt", "ms") +) +autoplot(archive_cases_dv_subset, percent_cli, + .versions = "month", + .facet_filter = geo_value == "ca" +) +autoplot(archive_cases_dv_subset_all_states, percent_cli, + .versions = "1 month", + .facet_filter = geo_value \%in\% c("or", "az", "vt", "ms"), + .mark_versions = TRUE +) +# Just an alias for convenience +plot(archive_cases_dv_subset_all_states, percent_cli, + .versions = "1 month", + .facet_filter = geo_value \%in\% c("or", "az", "vt", "ms"), + .mark_versions = TRUE +) +} diff --git a/man/autoplot.epi_df.Rd b/man/autoplot.epi_df.Rd deleted file mode 100644 index d53335c14..000000000 --- a/man/autoplot.epi_df.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/autoplot.R -\name{autoplot.epi_df} -\alias{autoplot.epi_df} -\title{Automatically plot an epi_df} -\usage{ -\method{autoplot}{epi_df}( - object, - ..., - .color_by = c("all_keys", "geo_value", "other_keys", ".response", "all", "none"), - .facet_by = c(".response", "other_keys", "all_keys", "geo_value", "all", "none"), - .base_color = "#3A448F", - .max_facets = Inf -) -} -\arguments{ -\item{object}{An \code{epi_df}} - -\item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> One or more unquoted -expressions separated by commas. Variable names can be used as if they -were positions in the data frame, so expressions like \code{x:y} can -be used to select a range of variables.} - -\item{.color_by}{Which variables should determine the color(s) used to plot -lines. 
Options include: -\itemize{ -\item \code{all_keys} - the default uses the interaction of any key variables -including the \code{geo_value} -\item \code{geo_value} - \code{geo_value} only -\item \code{other_keys} - any available keys that are not \code{geo_value} -\item \code{.response} - the numeric variables (same as the y-axis) -\item \code{all} - uses the interaction of all keys and numeric variables -\item \code{none} - no coloring aesthetic is applied -}} - -\item{.facet_by}{Similar to \code{.color_by} except that the default is to display -each numeric variable on a separate facet} - -\item{.base_color}{Lines will be shown with this color. For example, with a -single numeric variable and faceting by \code{geo_value}, all locations would -share the same color line.} - -\item{.max_facets}{Cut down of the number of facets displayed. Especially -useful for testing when there are many \code{geo_value}'s or keys.} -} -\value{ -A ggplot object -} -\description{ -Automatically plot an epi_df -} -\examples{ -autoplot(cases_deaths_subset, cases, death_rate_7d_av) -autoplot(cases_deaths_subset, case_rate_7d_av, .facet_by = "geo_value") -autoplot(cases_deaths_subset, case_rate_7d_av, - .color_by = "none", - .facet_by = "geo_value" -) -autoplot(cases_deaths_subset, case_rate_7d_av, - .color_by = "none", - .base_color = "red", .facet_by = "geo_value" -) - -# .base_color specification won't have any effect due .color_by default -autoplot(cases_deaths_subset, case_rate_7d_av, - .base_color = "red", .facet_by = "geo_value" -) -} diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index f98666e03..32cba5b5c 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -1,23 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/archive.R -\name{epi_archive} +\name{as_epi_archive} +\alias{as_epi_archive} +\alias{is_epi_archive} \alias{epi_archive} \alias{new_epi_archive} \alias{validate_epi_archive} -\alias{as_epi_archive} -\title{\code{epi_archive} object} +\title{\code{as_epi_archive} converts a data frame, data table, or tibble into an +\code{epi_archive} object.} \usage{ -new_epi_archive( - x, - geo_type, - time_type, - other_keys, - clobberable_versions_start, - versions_end -) - -validate_epi_archive(x) - as_epi_archive( x, geo_type = deprecated(), @@ -30,6 +21,19 @@ as_epi_archive( ..., versions_end = .versions_end ) + +is_epi_archive(x) + +new_epi_archive( + x, + geo_type, + time_type, + other_keys, + clobberable_versions_start, + versions_end +) + +validate_epi_archive(x) } \arguments{ \item{x}{A data.frame, data.table, or tibble, with columns \code{geo_value}, @@ -47,32 +51,6 @@ that should be considered key variables (in the language of \code{data.table}) apart from "geo_value", "time_value", and "version". Typical examples are "age" or more granular geographies.} -\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. 
There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} - -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is -\code{max_version_with_row_in(x)}, but values greater than this could also be -valid, and would indicate that we observed additional versions of the data -beyond \code{max(x$version)}, but they all contained empty updates. (The default -value of \code{clobberable_versions_start} does not fully trust these empty -updates, and assumes that any version \verb{>= max(x$version)} could be -clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} - \item{compactify}{Optional; \code{TRUE}, \code{FALSE}, or \code{"message"}. \code{TRUE} will remove some redundant rows, \code{FALSE} will not. \code{"message"} is like \code{TRUE} but will emit a message if anything was changed. Default is \code{TRUE}. See @@ -84,15 +62,57 @@ corresponds to exact equality. Consider using this if your value columns undergo tiny nonmeaningful revisions and the archive object with the default setting is too large.} +\item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the +same \code{class} as \code{x$version}, or an \code{NA} of any \code{class}: specifically, +either (a) the earliest version that could be subject to "clobbering" +(being overwritten with different update data, but using the \emph{same} version +tag as the old update data), or (b) \code{NA}, to indicate that no versions are +clobberable. There are a variety of reasons why versions could be +clobberable under routine circumstances, such as (a) today's version of +one/all of the columns being published after initially being filled with +\code{NA} or LOCF, (b) a buggy version of today's data being published but then +fixed and republished later in the day, or (c) data pipeline delays (e.g., +publisher uploading, periodic scraping, database syncing, periodic +fetching, etc.) that make events (a) or (b) reflected later in the day (or +even on a different day) than expected; potential causes vary between +different data pipelines. The default value is \code{NA}, which doesn't consider +any versions to be clobberable. Another setting that may be appropriate for +some pipelines is \code{max_version_with_row_in(x)}.} + \item{.versions_end}{location based versions_end, used to avoid prefix \code{version = issue} from being assigned to \code{versions_end} instead of being used to rename columns.} \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}} + +\item{versions_end}{Optional; length-1, same \code{class} as \code{x$version}: what is +the last version we have observed? 
The default is +\code{max_version_with_row_in(x)}, but values greater than this could also be +valid, and would indicate that we observed additional versions of the data +beyond \code{max(x$version)}, but they all contained empty updates. (The default +value of \code{clobberable_versions_start} does not fully trust these empty +updates, and assumes that any version \verb{>= max(x$version)} could be +clobbered.) If \code{nrow(x) == 0}, then this argument is mandatory.} } \value{ -An \code{epi_archive} object. +\itemize{ +\item Of \code{as_epi_archive}: an \code{epi_archive} object +} + +\itemize{ +\item Of \code{is_epi_archive}: \code{TRUE} if the object inherits from \code{epi_archive}, +otherwise \code{FALSE}. +} + +\itemize{ +\item Of \code{new_epi_archive}: an (unvalidated) \code{epi_archive} +} + +\itemize{ +\item Of \code{validate_epi_archive}: an \code{epi_archive}, +\link[base:invisible]{invisibly} (or raises an error if \code{x} was invalid) +} } \description{ The second main data structure for storing time series in @@ -109,6 +129,9 @@ only performs some fast, basic checks on the inputs. \code{validate_epi_archive} can perform more costly validation checks on its output. But most users should use \code{as_epi_archive}, which performs all necessary checks and has some additional features. + +Does not validate \code{geo_type} or \code{time_type} against \code{geo_value} and +\code{time_value} columns. These are assumed to have been set compatibly. } \details{ An \code{epi_archive} contains a \code{data.table} object \code{DT} (from the diff --git a/man/epi_df.Rd b/man/epi_df.Rd index a67827187..0ec5830ca 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -89,7 +89,8 @@ which can be seen as measured variables at each key. In brief, an \code{epi_df} represents a snapshot of an epidemiological data set at a point in time. } \details{ -An \code{epi_df} is a tibble with (at least) the following columns: +An \code{epi_df} is a kind of tibble with (at least) the following +columns: \itemize{ \item \code{geo_value}: A character vector representing the geographical unit of observation. This could be a country code, a state name, a county code, @@ -98,10 +99,11 @@ etc. } Other columns can be considered as measured variables, which we also refer to -as signal variables. An \code{epi_df} object also has metadata with (at least) -the following fields: +as indicators or signals. An \code{epi_df} object also has metadata with (at +least) the following fields: \itemize{ \item \code{geo_type}: the type for the geo values. +\item \code{time_type}: the type for the time values. \item \code{as_of}: the time value at which the given data were available. } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index f053909db..92c1fa914 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/slide.R \name{epi_slide} \alias{epi_slide} -\title{Slide a function over variables in an \code{epi_df} object} +\title{More general form of \code{\link{epi_slide_opt}} for rolling/running computations} \usage{ epi_slide( .x, @@ -20,34 +20,46 @@ and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} -\item{.f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. The return of the computation should either be a -scalar or a 1-row data frame.
Data frame returns will be -\code{tidyr::unpack()}-ed, if named, and will be \code{\link[tidyr:pack]{tidyr::pack}}-ed columns, if -not named. See examples. +\item{.f, ...}{The computation to slide. The input will be a time window of +the data for a single subpopulation (i.e., a single \code{geo_value} and single +value for any \code{\link[=as_epi_df]{other_keys}} you set up, such as age groups, race, etc.). +The input will always have the same size, determined by \code{.window_size}, and +will fill in any missing \code{time_values}, using \code{NA} values for missing +measurements. The output should be a scalar value or a 1-row data frame; +these outputs will be collected into a new column or columns in the +\code{epi_slide()} result. Data frame outputs will be unpacked into multiple +columns in the result by default, or \code{\link[tidyr:pack]{tidyr::pack}}ed into a single +data-frame-type column if you provide a name for such a column (e.g., via +\code{.new_col_name}). + +You can specify the computation in one of the following ways: \itemize{ -\item If \code{.f} is missing, then \code{...} will specify the computation via -tidy-evaluation. This is usually the most convenient way to use -\code{epi_slide}. See examples. -\item If \code{.f} is a formula, then the formula should use \code{.x} (not the same as -the input \code{epi_df}) to operate on the columns of the input \code{epi_df}, e.g. -\code{~mean(.x$var)} to compute a mean of \code{var}. -\item If a function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where: +\item Don't provide \code{.f}, and instead use one or more +\code{\link[dplyr:summarise]{dplyr::summarize}}-esque \link[rlang:args_data_masking]{"data-masking"} +expressions in \code{...}, e.g., \code{cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]]}. This way is sometimes more +convenient, but also has the most computational overhead. +\item Provide a formula in \code{.f}, e.g., \code{~ .x$death_rate_7d_av[[22]]/.x$case_rate_7d_av[[1]]}. In this formula, \code{.x} +is an \code{epi_df} containing data for a single time window as described above, +taken from the original \code{.x} fed into \code{epi_slide()}. +\item Provide a function in \code{.f}, e.g., \code{function(x, g, t) x$death_rate_7d_av[[22]]/x$case_rate_7d_av[[1]]}. The function should be of +the form \verb{function(x, g, t)} or \verb{function(x, g, t, <additional configuration args>)}, where: \itemize{ \item \code{x} is a data frame with the same column names as the original object, minus any grouping variables, with only the windowed data for one group-\code{.ref_time_value} combination -\item \code{g} is a one-row tibble containing the values of the grouping variables -for the associated group +\item \code{g} is a one-row tibble specifying the \code{geo_value} and value of any +\code{other_keys} for this computation \item \code{t} is the \code{.ref_time_value} for the current window -\item \code{...} are additional arguments +\item If you have a complex \code{.f} containing \verb{<additional configuration args>}, you can provide values for those arguments in the \code{...} +argument to \code{epi_slide()}. } -}} -\item{...}{Additional arguments to pass to the function or formula specified -via \code{.f}.
Alternatively, if \code{.f} is missing, then the \code{...} is interpreted -as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions -for tidy evaluation.} +The values of \code{g} and \code{t} are also available to data-masking expression and +formula-based computations as \code{.group_key} and \code{.ref_time_value}, +respectively. Formula computations also let you use \code{.y} or \code{.z}, +respectively, as additional names for these same quantities (similar to +\code{\link[dplyr:group_map]{dplyr::group_modify}}). +}} \item{.window_size}{The size of the sliding window. The accepted values depend on the type of the \code{time_value} column in \code{.x}: @@ -95,30 +107,60 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -Slides a given function over variables in an \code{epi_df} object. -This is useful for computations like rolling averages. The function supports -many ways to specify the computation, but by far the most common use case is -as follows: - -\if{html}{\out{
}}\preformatted{# Create new column `cases_7dmed` that contains a 7-day trailing median of cases -epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) +Most rolling/running computations can be handled by \code{\link{epi_slide_mean}}, +\code{\link{epi_slide_sum}}, or the medium-generality \code{\link{epi_slide_opt}} functions, +which have been specialized to run more quickly. \code{epi_slide()} is a slower +but even more general function for rolling/running computations, and uses a +different interface to specify the computations; you typically only need to +consider using \code{epi_slide()} if you have a computation that depends on +multiple columns simultaneously, outputs multiple columns simultaneously, or +produces non-numeric output. For example, this computation depends on +multiple columns: +} +\details{ +\if{html}{\out{
}}\preformatted{cases_deaths_subset \%>\% + epi_slide( + cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]], + .window_size = 22 + ) \%>\% + print(n = 30) }\if{html}{\out{
}} -For two very common use cases, we provide optimized functions that are much -faster than \code{epi_slide}: \code{epi_slide_mean()} and \code{epi_slide_sum()}. We -recommend using these functions when possible. +(Here, the value 22 was selected using \code{epi_cor()} and averaging across +\code{geo_value}s. See +\href{https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1}{this +manuscript} for some warnings & information using similar types of CFR +estimators.) See \code{vignette("epi_df")} for more examples. +\subsection{Motivation and lower-level alternatives}{ + +\code{epi_slide()} is focused on preventing errors and providing a convenient +interface. If you need computational speed, many computations can be optimized +by one of the following: +\itemize{ +\item Performing core sliding operations with \code{epi_slide_opt()} with +\code{frollapply}, and using potentially-grouped \code{mutate()}s to transform or +combine the results. +\item Grouping by \code{geo_value} and any \code{other_keys}; \code{\link[=complete]{complete()}}ing with +\code{full_seq()} to fill in time gaps; \code{arrange()}ing by \code{time_value}s within +each group; using \code{mutate()} with vectorized operations and shift operators +like \code{dplyr::lead()} and \code{dplyr::lag()} to perform the core operations, +being careful to give the desired results for the least and most recent +\code{time_value}s (often \code{NA}s for the least recent); ungrouping; and +\code{filter()}ing back down to only rows that existed before the \code{complete()} +stage if necessary. } -\details{ +} + \subsection{Advanced uses of \code{.f} via tidy evaluation}{ -If specifying \code{.f} via tidy evaluation, in addition to the standard \code{\link{.data}} -and \code{\link{.env}}, we make some additional "pronoun"-like bindings available: +If specifying \code{.f} via tidy evaluation, in addition to the standard \code{\link[rlang:dot-data]{.data}} +and \code{\link[rlang:dot-data]{.env}}, we make some additional "pronoun"-like bindings available: \itemize{ \item .x, which is like \code{.x} in \code{\link[dplyr:group_map]{dplyr::group_modify}}; an ordinary object like an \code{epi_df} rather than an rlang \link[rlang:as_data_mask]{pronoun} -like \code{\link{.data}}; this allows you to use additional \code{dplyr}, \code{tidyr}, and +like \code{.data}; this allows you to use additional \code{dplyr}, \code{tidyr}, and \code{epiprocess} operations. If you have multiple expressions in \code{...}, this won't let you refer to the output of the earlier expressions, but \code{.data} will. @@ -131,34 +173,43 @@ determined the time window for the current computation. \examples{ library(dplyr) -# Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases -cases_deaths_subset \%>\% +# Generate some simple time-varying CFR estimates: +with_cfr_estimates <- cases_deaths_subset \%>\% epi_slide( - cases_7sd = sd(cases, na.rm = TRUE), - cases_7dav = mean(cases, na.rm = TRUE), - .window_size = 7 - ) \%>\% - select(geo_value, time_value, cases, cases_7sd, cases_7dav) -# Note that epi_slide_mean could be used to more quickly calculate cases_7dav. 
- -# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or -# formula in a way similar to [`dplyr::group_modify`]: -my_summarizer <- function(window_data) { - window_data \%>\% - summarize( - cases_7sd = sd(cases, na.rm = TRUE), - cases_7dav = mean(cases, na.rm = TRUE) - ) + cfr_estimate_v0 = death_rate_7d_av[[22]] / case_rate_7d_av[[1]], + .window_size = 22 + ) +with_cfr_estimates \%>\% + print(n = 30) +# (Here, the value 22 was selected using `epi_cor()` and averaging across +# `geo_value`s. See +# https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1 for some +# warnings & information using CFR estimators along these lines.) + +# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a +# function or formula in a way similar to [`dplyr::group_modify`]; these +# often run much more quickly: +my_computation <- function(window_data) { + tibble( + cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / + window_data$case_rate_7d_av[[1]] + ) } -cases_deaths_subset \%>\% +with_cfr_estimates2 <- cases_deaths_subset \%>\% epi_slide( - ~ my_summarizer(.x), - .window_size = 7 - ) \%>\% - select(geo_value, time_value, cases, cases_7sd, cases_7dav) - - - + ~ my_computation(.x), + .window_size = 22 + ) +with_cfr_estimates3 <- cases_deaths_subset \%>\% + epi_slide( + function(window_data, g, t) { + tibble( + cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / + window_data$case_rate_7d_av[[1]] + ) + }, + .window_size = 22 + ) #### Advanced: #### diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b75e9ffb..e1cb20924 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -4,7 +4,7 @@ \alias{epi_slide_opt} \alias{epi_slide_mean} \alias{epi_slide_sum} -\title{Optimized slide functions for common cases} +\title{Calculate rolling or running means, sums, etc., or custom calculations} \usage{ epi_slide_opt( .x, @@ -51,7 +51,7 @@ epi_slide_sum( and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} -\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{.col_names}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted column name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if @@ -134,10 +134,20 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -\code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} -or \link[slider:summary-slide]{slider::summary-slide} function over variables in an \code{epi_df} object. -These functions tend to be much faster than \code{epi_slide()}. See -\code{vignette("epi_df")} for more examples. +These methods take each subpopulation (i.e., a single +\code{geo_value} and combination of any \code{other_keys} you set up for age groups, +etc.) and perform a \code{.window_size}-width time window rolling/sliding +computation, or alternatively, a running/cumulative computation (with +\code{.window_size = Inf}) on the requested columns. Explicit \code{NA} measurements +are temporarily added to fill in any time gaps, and, for rolling +computations, to pad the time series to ensure that the first & last +computations use exactly \code{.window_size} values. 
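A minimal sketch of the interface described above (assuming the `cases_deaths_subset` dataset bundled with the package and a daily `time_type`; output column naming follows the package defaults):

library(epiprocess)
library(dplyr)

# 7-day trailing mean of `cases` within each geo_value; any gaps in
# `time_value` are temporarily filled with NA measurements as described above:
cases_deaths_subset %>%
  epi_slide_mean(cases, .window_size = 7)

# The same computation, spelled out via the more general epi_slide_opt():
cases_deaths_subset %>%
  epi_slide_opt(cases, .f = data.table::frollmean, .window_size = 7)

# A running/cumulative sum instead of a rolling one:
cases_deaths_subset %>%
  epi_slide_sum(cases, .window_size = Inf)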
+ +\code{epi_slide_opt} allows you to use any \link[data.table:froll]{data.table::froll} or +\link[slider:summary-slide]{slider::summary-slide} function. If none of those specialized functions fit +your usecase, you can use \code{data.table::frollapply} together with a non-rolling +function (e.g., \code{median}). See \code{\link{epi_slide}} if you need to work with +multiple columns at once or output a custom type. \code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. diff --git a/man/epiprocess-package.Rd b/man/epiprocess-package.Rd index fe79c01e8..12e8a3c42 100644 --- a/man/epiprocess-package.Rd +++ b/man/epiprocess-package.Rd @@ -11,6 +11,7 @@ This package introduces common data structures for working with epidemiological \seealso{ Useful links: \itemize{ + \item \url{https://github.com/cmu-delphi/epiprocess} \item \url{https://cmu-delphi.github.io/epiprocess/} } diff --git a/man/epix_as_of_current.Rd b/man/epix_as_of_current.Rd new file mode 100644 index 000000000..8dcb53afc --- /dev/null +++ b/man/epix_as_of_current.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{epix_as_of_current} +\alias{epix_as_of_current} +\title{Get the latest snapshot from an \code{epi_archive} object.} +\usage{ +epix_as_of_current(x) +} +\arguments{ +\item{x}{An \code{epi_archive} object} +} +\value{ +The latest snapshot from an \code{epi_archive} object +} +\description{ +The latest snapshot is the snapshot of the last known version. +} diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index cadb19833..ad6be18b5 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -1,10 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R \name{epix_slide} \alias{epix_slide} -\alias{epix_slide.epi_archive} -\alias{epix_slide.grouped_epi_archive} -\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} +\title{Take each requested (group and) version in an archive, run a computation (e.g., forecast)} \usage{ epix_slide( .x, @@ -15,51 +13,39 @@ epix_slide( .new_col_name = NULL, .all_versions = FALSE ) - -\method{epix_slide}{epi_archive}( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) - -\method{epix_slide}{grouped_epi_archive}( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) } \arguments{ \item{.x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, all data in \code{x} will be treated as part of a single data group.} \item{.f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{.before} parameter (see details for more). If a -function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where -\itemize{ -\item "x" is an epi_df with the same column names as the archive's \code{DT}, minus -the \code{version} column -\item "g" is a one-row tibble containing the values of the grouping variables -for the associated group -\item "t" is the ref_time_value for the current window -\item "..." are additional arguments -} +computation. 
The computation will be run on each requested group-version +combination, with a time window filter applied if \code{.before} is supplied. + +If \code{.f} is a function, it must have the form \verb{function(x, g, v)} or +\verb{function(x, g, v, <additional configuration args>)}, where + +\if{html}{\out{
}}\preformatted{- `x` is an `epi_df` with the same column names as the archive's `DT`, + minus the `version` column. (Or, if `.all_versions = TRUE`, an + `epi_archive` with the requested partial version history.) + +- `g` is a one-row tibble containing the values of the grouping variables + for the associated group. + +- `v` (length-1) is the associated `version` (one of the requested + `.versions`) + +- `<additional configuration args>` are optional; you can add such + arguments to your function and set them by passing them through the + `...` argument to `epix_slide()`. }\if{html}{\out{
}} If a formula, \code{.f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via \code{.z} -or \code{.ref_time_value}. If \code{.f} is missing, then \code{...} will specify the -computation.} +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z}, \code{.version}, or \code{.ref_time_value}. If \code{.f} is missing, then \code{...} will +specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified via \code{f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted @@ -67,41 +53,46 @@ as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions for tidy evaluation; in addition to referring columns directly by name, the expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs, and can also refer to \code{.x} (not the same as the input epi_archive), -\code{.group_key}, and \code{.ref_time_value}. See details for more.} - -\item{.before}{How many time values before the \code{.ref_time_value} -should each snapshot handed to the function \code{.f} contain? If provided, it -should be a single value that is compatible with the time_type of the -time_value column (more below), but most commonly an integer. This window -endpoint is inclusive. For example, if \code{.before = 7}, \code{time_type} -in the archive is "day", and the \code{.ref_time_value} is January 8, then the -smallest time_value in the snapshot will be January 1. If missing, then the -default is no limit on the time values, so the full snapshot is given.} - -\item{.versions}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{epix_as_of} which we fetch data in this window. If missing, then this will -set to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} +\code{.group_key} and \code{.version}/\code{.ref_time_value}. See details for more.} + +\item{.before}{Optional; applies a \code{time_value} filter before running each +computation. The default is not to apply a \code{time_value} filter. If +provided, it should be a single integer or difftime that is compatible with +the time_type of the time_value column. If an integer, then the minimum +possible \code{time_value} included will be that many time steps (according to +the \code{time_type}) before each requested \code{.version}. This window endpoint is +inclusive. For example, if \code{.before = 14}, the \code{time_type} in the archive +is "day", and the requested \code{.version} is January 15, then the smallest +possible \code{time_value} in the snapshot will be January 1. Note that +this does not mean that there will be 14 or 15 distinct \code{time_value}s +actually appearing in the data; for most reporting streams, reporting as of +January 15 won't include \code{time_value}s all the way through January 14, due +to reporting latency.
Unlike \code{epi_slide()}, \code{epix_slide()} won't fill in +any missing \code{time_values} in this window.} + +\item{.versions}{Requested versions on which to run the computation. Each +requested \code{.version} also serves as the anchor point from which +the \code{time_value} window specified by \code{.before} is drawn. If \code{.versions} is +missing, it will be set to a regularly-spaced sequence of values set to +cover the range of \code{version}s in the \code{DT} plus the \code{versions_end}; the +spacing of values will be guessed (using the GCD of the skips between +values).} \item{.new_col_name}{Either \code{NULL} or a string indicating the name of the new column that will contain the derived values. The default, \code{NULL}, will use the name "slide_value" unless your slide computations output data frames, -in which case they will be unpacked into the constituent columns and those -names used. If the resulting column name(s) overlap with the column names -used for labeling the computations, which are \code{group_vars(x)} and -\code{"version"}, then the values for these columns must be identical to the -labels we assign.} +in which case they will be unpacked into the constituent columns and the +data frame's column names will be used instead. If the resulting column +name(s) overlap with the column names used for labeling the computations, +which are \code{group_vars(x)} and \code{"version"}, then the values for these +columns must be identical to the labels we assign.} \item{.all_versions}{(Not the same as \code{.all_rows} parameter of \code{epi_slide}.) If \code{.all_versions = TRUE}, then the slide computation will be passed the -version history (all \code{version <= .version} where \code{.version} is one of the -requested \code{.versions}) for rows having a \code{time_value} of at least `.version -\itemize{ -\item before\verb{. Otherwise, the slide computation will be passed only the most recent }version\verb{for every unique}time_value\verb{. Default is }FALSE`. -}} +version history (all versions \verb{<= .version} where \code{.version} is one of the +requested \code{.version}s), in \code{epi_archive} format. Otherwise, the slide +computation will be passed only the most recent \code{version} for every unique +\code{time_value}, in \code{epi_df} format. Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables (if any), @@ -110,28 +101,32 @@ computation, and a column named according to the \code{.new_col_name} argument, containing the slide values. It will be grouped by the grouping variables. } \description{ -Slides a given function over variables in an \code{epi_archive} object. This -behaves similarly to \code{epi_slide()}, with the key exception that it is -version-aware: the sliding computation at any given reference time t is -performed on \strong{data that would have been available as of t}. This function -is intended for use in accurate backtesting of models; see -\code{vignette("backtesting", package="epipredict")} for a walkthrough. +... and collect the results. This is useful for more accurately simulating +how a forecaster, nowcaster, or other algorithm would have behaved in real +time, factoring in reporting latency and data revisions; see +\href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{\code{vignette("backtesting", package="epipredict")}} for a walkthrough. 
} \details{ +This is similar to looping over versions and calling \code{\link{epix_as_of}}, but has +some conveniences such as working naturally with \code{\link{grouped_epi_archive}}s, +optional time windowing, and syntactic sugar to make things shorter to write. + A few key distinctions between the current function and \code{epi_slide()}: \enumerate{ \item In \code{.f} functions for \code{epix_slide}, one should not assume that the input data to contain any rows with \code{time_value} matching the computation's -\code{.ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for -typical epidemiological surveillance data, observations pertaining to a -particular time period (\code{time_value}) are first reported \code{as_of} some -instant after that time period has ended. +\code{.version}, due to reporting latency; for typical epidemiological +surveillance data, observations pertaining to a particular time period +(\code{time_value}) are first reported \code{as_of} some instant after that time +period has ended. No time window completion is performed as in +\code{epi_slide()}. \item The input class and columns are similar but different: \code{epix_slide} (with the default \code{.all_versions=FALSE}) keeps all columns and the \code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only provides the grouping variables in the second input, and will convert the first input into a regular tibble if the grouping variables include the -essential \code{geo_value} column. (With .all_versions=TRUE\verb{, }epix_slide\verb{will will provide an}epi_archive\verb{rather than an}epi-df` to each +essential \code{geo_value} column. (With \code{.all_versions=TRUE}, \code{epix_slide} +will provide an \code{epi_archive} rather than an \code{epi_df} to each computation.) \item The output class and columns are similar but different: \code{epix_slide()} returns a tibble containing only the grouping variables, \code{time_value}, and @@ -144,82 +139,61 @@ results as they are not supported by tibbles.) \item There are no size stability checks or element/row recycling to maintain size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly -analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed -in the "advanced" vignette. +analogous to \code{\link[dplyr:mutate]{dplyr::mutate}}.) \item \code{.all_rows} is not supported in \code{epix_slide}; since the slide computations are allowed more flexibility in their outputs than in \code{epi_slide}, we can't guess a good representation for missing computations for excluded group-\code{.ref_time_value} pairs. \item The \code{.versions} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the -\code{versions_end}, rather than the \code{time_value}s. +\code{versions_end}, rather than all unique \code{time_value}s. +\item \code{epix_slide()} computations can refer to the current element of +\code{.versions} as either \code{.version} or \code{.ref_time_value}, while \code{epi_slide()} +computations refer to the current element of \code{.ref_time_values} with +\code{.ref_time_value}. } Apart from the above distinctions, the interfaces between \code{epix_slide()} and \code{epi_slide()} are the same.
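A rough sketch of the "looping over versions and calling `epix_as_of`" equivalence mentioned above (the version range, summary statistic, and object names are invented for illustration; `epix_slide()` additionally handles grouping, result labeling, and row-binding for you):

library(dplyr)

versions <- seq(as.Date("2020-09-02"), as.Date("2020-09-08"), by = "1 day")
manual <- bind_rows(lapply(versions, function(v) {
  # Data exactly as it would have appeared on day v:
  snapshot <- epix_as_of(archive_cases_dv_subset, v)
  snapshot %>%
    group_by(geo_value) %>%
    summarize(mean_percent_cli = mean(percent_cli, na.rm = TRUE), .groups = "drop") %>%
    mutate(version = v)
}))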
- -Furthermore, the current function can be considerably slower than -\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch -properly-versioned snapshots from the data archive (via \code{epix_as_of()}), -and (2) it performs a "manual" sliding of sorts, and does not benefit from -the highly efficient \code{slider} package. For this reason, it should never be -used in place of \code{epi_slide()}, and only used when version-aware sliding is -necessary (as it its purpose). } \examples{ library(dplyr) -# Reference time points for which we want to compute slide values: -versions <- seq(as.Date("2020-06-02"), - as.Date("2020-06-15"), - by = "1 day" -) +# Request only a small set of versions, for example's sake: +requested_versions <- + seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day") -# A simple (but not very useful) example (see the archive vignette for a more -# realistic one): +# Investigate reporting lag of `percent_cli` signal (though normally we'd +# probably work off of the dedicated `revision_summary()` function instead): archive_cases_dv_subset \%>\% - group_by(geo_value) \%>\% epix_slide( - .f = ~ mean(.x$case_rate_7d_av), - .before = 2, - .versions = versions, - .new_col_name = "case_rate_7d_av_recent_av" - ) \%>\% - ungroup() -# We requested time windows that started 2 days before the corresponding time -# values. The actual number of `time_value`s in each computation depends on -# the reporting latency of the signal and `time_value` range covered by the -# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -# discarded -# * 1 `time_value`, for ref time 2020-06-02 -# * 2 `time_value`s, for the rest of the results -# * never the 3 `time_value`s we would get from `epi_slide`, since, because -# of data latency, we'll never have an observation -# `time_value == .ref_time_value` as of `.ref_time_value`. -# The example below shows this type of behavior in more detail. - -# Examining characteristics of the data passed to each computation with -# `all_versions=FALSE`. + geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]), + geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time, + .versions = requested_versions + ) archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% epix_slide( - function(x, gk, rtv) { - tibble( - time_range = if (nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - .before = 5, .all_versions = FALSE, - .versions = versions - ) \%>\% - ungroup() \%>\% - arrange(geo_value, version) + percent_cli_max_time = max(time_value[!is.na(percent_cli)]), + percent_cli_rpt_lag = .version - percent_cli_max_time, + .versions = requested_versions + ) + +# Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect +# to the data version history): +case_death_rate_archive \%>\% + epix_slide( + .versions = as.Date(c("2021-10-01", "2021-10-08")), + function(x, g, v) { + epipredict::arx_forecaster( + x, + outcome = "death_rate", + predictors = c("death_rate_7d_av", "case_rate_7d_av") + )$predictions + } + ) +# See `vignette("backtesting", package="epipredict")` for a full walkthrough +# on backtesting forecasters, including plots, etc. 
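The `.before` windowing documented above composes with the same data-masking style; a hedged sketch reusing `requested_versions` from the examples above (the output column name is invented for illustration):

# Average `percent_cli` over at most the 15 most recent days of data available
# as of each requested version (inclusive endpoints; reporting latency may
# leave fewer time_values in practice):
archive_cases_dv_subset %>%
  group_by(geo_value) %>%
  epix_slide(
    recent_percent_cli_av = mean(percent_cli, na.rm = TRUE),
    .before = 14,
    .versions = requested_versions
  )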
# --- Advanced: --- @@ -251,7 +225,7 @@ archive_cases_dv_subset \%>\% ) }, .before = 5, .all_versions = TRUE, - .versions = versions + .versions = requested_versions ) \%>\% ungroup() \%>\% # Focus on one geo_value so we can better see the columns above: diff --git a/man/figures/lifecycle-deprecated.svg b/man/figures/lifecycle-deprecated.svg new file mode 100644 index 000000000..b61c57c3f --- /dev/null +++ b/man/figures/lifecycle-deprecated.svg @@ -0,0 +1,21 @@ + + lifecycle: deprecated + + + + + + + + + + + + + + + lifecycle + + deprecated + + diff --git a/man/figures/lifecycle-experimental.svg b/man/figures/lifecycle-experimental.svg new file mode 100644 index 000000000..5d88fc2c6 --- /dev/null +++ b/man/figures/lifecycle-experimental.svg @@ -0,0 +1,21 @@ + + lifecycle: experimental + + + + + + + + + + + + + + + lifecycle + + experimental + + diff --git a/man/figures/lifecycle-stable.svg b/man/figures/lifecycle-stable.svg new file mode 100644 index 000000000..9bf21e76b --- /dev/null +++ b/man/figures/lifecycle-stable.svg @@ -0,0 +1,29 @@ + + lifecycle: stable + + + + + + + + + + + + + + + + lifecycle + + + + stable + + + diff --git a/man/figures/lifecycle-superseded.svg b/man/figures/lifecycle-superseded.svg new file mode 100644 index 000000000..db8d757f7 --- /dev/null +++ b/man/figures/lifecycle-superseded.svg @@ -0,0 +1,21 @@ + + lifecycle: superseded + + + + + + + + + + + + + + + lifecycle + + superseded + + diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd new file mode 100644 index 000000000..5f9d72db4 --- /dev/null +++ b/man/filter.epi_archive.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{filter.epi_archive} +\alias{filter.epi_archive} +\title{\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s} +\usage{ +\method{filter}{epi_archive}(.data, ..., .by = NULL, .format_aware = FALSE) +} +\arguments{ +\item{.data}{an \code{epi_archive}} + +\item{...}{as in \code{\link[dplyr:filter]{dplyr::filter}}; using the \code{version} column is not allowed +unless you use \code{.format_aware = TRUE}; see details.} + +\item{.by}{as in \code{\link[dplyr:filter]{dplyr::filter}}} + +\item{.format_aware}{optional, \code{TRUE} or \code{FALSE}; default \code{FALSE}. See +details.} +} +\description{ +\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s +} +\details{ +By default, using the \code{version} column or measurement columns is disabled as +it's easy to get unexpected results. See if either \code{\link{epix_as_of}} or +\code{\link{epix_slide}} works for any version selection you have in mind: for version +selection, see the \code{version} or \code{.versions} args, respectively; for +measurement column-based filtering, try \code{filter}ing after \code{epix_as_of} or +inside the \code{.f} in \code{epix_slide()}. If they don't cover your use case, then +you can set \code{.format_aware = TRUE} to enable usage of these columns, but be +careful to: +\itemize{ +\item Factor in that \code{.data$DT} may have been converted into a compact format +based on diffing consecutive versions, and the last version of each +observation in \code{.data$DT} will always be carried forward to future +\code{version}s\verb{; see details of [}as_epi_archive`]. +\item Set \code{clobberable_versions_start} and \code{versions_end} of the result +appropriately after the \code{filter} call. They will be initialized with the +same values as in \code{.data}. 
+} + +\code{dplyr::filter} also has an optional argument \code{.preserve}, which should not +have an impact on (ungrouped) \code{epi_archive}s, and \code{grouped_epi_archive}s do +not currently support \code{dplyr::filter}. +} +\examples{ + +# Filter to one location and a particular time range: +archive_cases_dv_subset \%>\% + filter(geo_value == "fl", time_value >= as.Date("2020-10-01")) + +# Convert to weekly by taking the Saturday data for each week, so that +# `case_rate_7d_av` represents a Sun--Sat average: +archive_cases_dv_subset \%>\% + filter(as.POSIXlt(time_value)$wday == 6L) + +# Filtering involving the `version` column or measurement columns requires +# extra care. See epix_as_of and epix_slide instead for some common +# operations. One semi-common operation that ends up being fairly simple is +# treating observations as finalized after some amount of time, and ignoring +# any revisions that were made after that point: +archive_cases_dv_subset \%>\% + filter( + version <= time_value + as.difftime(60, units = "days"), + .format_aware = TRUE + ) + +} diff --git a/man/format_chr_with_quotes.Rd b/man/format_chr_with_quotes.Rd index 49beffb00..555dd402c 100644 --- a/man/format_chr_with_quotes.Rd +++ b/man/format_chr_with_quotes.Rd @@ -2,19 +2,29 @@ % Please edit documentation in R/utils.R \name{format_chr_with_quotes} \alias{format_chr_with_quotes} -\title{Format a character vector as a string via deparsing/quoting each} +\title{Format each entry in a character vector via quoting; special replacement for length 0} \usage{ format_chr_with_quotes(x, empty = "*none*") } \arguments{ -\item{x}{\code{chr}; e.g., \code{colnames} of some data frame} +\item{x}{chr; e.g., \code{colnames} of some data frame} -\item{empty}{string; what should be output if \code{x} is of length 0?} +\item{empty}{chr, likely string; what should be output if \code{x} is of length 0?} } \value{ -string +chr; same \code{length} as \code{x} if \code{x} had nonzero length; value of \code{empty} otherwise } \description{ -Format a character vector as a string via deparsing/quoting each +Performs no escaping within the strings; if you want something that a reader +could copy-paste to debug, look into \code{format_deparse} (note that this +collapses into a single string).
+} +\examples{ +cli::cli_inform('{epiprocess:::format_chr_with_quotes("x")}') +cli::cli_inform('{epiprocess:::format_chr_with_quotes(c("x","y"))}') +nms <- c("x", "\"Total Cases\"") +cli::cli_inform("{epiprocess:::format_chr_with_quotes(nms)}") +cli::cli_inform("{epiprocess:::format_chr_with_quotes(character())}") + } \keyword{internal} diff --git a/man/revision_summary.Rd b/man/revision_analysis.Rd similarity index 70% rename from man/revision_summary.Rd rename to man/revision_analysis.Rd index 54abf098c..59f0c9593 100644 --- a/man/revision_summary.Rd +++ b/man/revision_analysis.Rd @@ -1,30 +1,46 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/revision_analysis.R -\name{revision_summary} +\name{revision_analysis} +\alias{revision_analysis} +\alias{print.revision_analysis} \alias{revision_summary} \title{A function to describe revision behavior for an archive.} \usage{ -revision_summary( +revision_analysis( epi_arch, ..., drop_nas = TRUE, - print_inform = TRUE, - min_waiting_period = as.difftime(60, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + min_waiting_period = as.difftime(60, units = "days"), within_latest = 0.2, - quick_revision = as.difftime(3, units = "days") \%>\% - difftime_approx_ceiling_time_delta(epi_arch$time_type), + compactify = TRUE, + compactify_abs_tol = 0, + return_only_tibble = FALSE +) + +\method{print}{revision_analysis}( + x, + quick_revision = as.difftime(3, units = "days"), few_revisions = 3, abs_spread_threshold = NULL, rel_spread_threshold = 0.1, + ... +) + +revision_summary( + epi_arch, + ..., + drop_nas = TRUE, + min_waiting_period = as.difftime(60, units = "days"), + within_latest = 0.2, compactify = TRUE, - compactify_abs_tol = 0 + compactify_abs_tol = 0, + return_only_tibble = FALSE ) } \arguments{ \item{epi_arch}{an epi_archive to be analyzed} -\item{...}{<\code{\link[=dplyr_tidy_select]{tidyselect}}>, used to choose the column to +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidyselect}}>, used to choose the column to summarize. If empty and there is only one value/measurement column (i.e., not in \code{\link{key_colnames}}) in the archive, it will automatically select it. If supplied, \code{...} must select exactly one column.} @@ -34,47 +50,57 @@ If supplied, \code{...} must select exactly one column.} sure there are no duplicate values from occasions when the signal is revised to \code{NA}, and then back to its immediately-preceding value.} -\item{print_inform}{bool, determines whether to print summary information, or -only return the full summary tibble} - \item{min_waiting_period}{\code{difftime}, integer or \code{NULL}. Sets a cutoff: any time_values that have not had at least \code{min_waiting_period} to stabilize as of the \code{versions_end} are removed. \code{min_waiting_period} should characterize the typical time during which most significant revisions occur. The default of 60 days corresponds to a typical near-final value for case counts as reported in the context of insurance. To avoid this filtering, either set -to \code{NULL} or 0.} +to \code{NULL} or 0. A \code{difftime} will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} \item{within_latest}{double between 0 and 1. 
Determines the threshold used for the \code{lag_to}} -\item{quick_revision}{difftime or integer (integer is treated as days), for -the printed summary, the amount of time between the final revision and the +\item{compactify}{bool. If \code{TRUE}, we will compactify after the signal +requested in \code{...} has been selected on its own and the \code{drop_nas} step. +This helps, for example, to give similar results when called on +\link[=epix_merge]{merged} and single-signal archives, since merged archives +record an update when any of the other signals change, not just the +requested signal. The default is \code{TRUE}.} + +\item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it +determines the threshold for when two doubles are considered identical.} + +\item{return_only_tibble}{boolean to return only the simple \code{tibble} of +computational results rather than the complete S3 object.} + +\item{x}{a \code{revision_analysis} object} + +\item{quick_revision}{Difftime or integer (integer is treated as days). +The amount of time between the final revision and the actual time_value to consider the revision quickly resolved. Default of 3 -days} +days. This will be rounded up to the appropriate \code{time_type} if +necessary (that is 5 days will be rounded to 1 week if the data is weekly).} -\item{few_revisions}{integer, for the printed summary, the upper bound on the +\item{few_revisions}{Integer. The upper bound on the number of revisions to consider "few". Default is 3.} -\item{abs_spread_threshold}{length-1 numeric, for the printed summary, the +\item{abs_spread_threshold}{Scalar numeric. The maximum spread used to characterize revisions which don't actually change very much. Default is 5\% of the maximum value in the dataset, but this is the most unit dependent of values, and likely needs to be chosen appropriate for the scale of the dataset.} -\item{rel_spread_threshold}{length-1 double between 0 and 1, for the printed -summary, the relative spread fraction used to characterize revisions which +\item{rel_spread_threshold}{Scalar between 0 and 1. The relative spread fraction used to characterize revisions which don't actually change very much. Default is .1, or 10\% of the final value} - -\item{compactify}{bool. If \code{TRUE}, we will compactify after the signal -requested in \code{...} has been selected on its own and the \code{drop_nas} step. -This helps, for example, to give similar results when called on -\link[=epix_merge]{merged} and single-signal archives, since merged archives -record an update when any of the other signals change, not just the -requested signal. The default is \code{TRUE}.} - -\item{compactify_abs_tol}{length-1 double, used if \code{compactify} is \code{TRUE}, it -determines the threshold for when two doubles are considered identical.} +} +\value{ +An S3 object with class \code{revision_analysis}. This function is typically +called for the purposes of inspecting the printed output. The +results of the computations are available in +\code{revision_analysis(...)$revision_behavior}. If you only want to access +the internal computations, use \code{return_only_tibble = TRUE}. } \description{ \code{revision_summary} removes all missing values (if requested), and then @@ -113,7 +139,7 @@ produce incorrect results for some calculations, since week numbering contains jumps at year boundaries.
} \examples{ -revision_example <- revision_summary(archive_cases_dv_subset, percent_cli) -revision_example \%>\% arrange(desc(spread)) +revision_example <- revision_analysis(archive_cases_dv_subset, percent_cli) +revision_example$revision_behavior \%>\% arrange(desc(spread)) } diff --git a/man/set_versions_end.Rd b/man/set_versions_end.Rd new file mode 100644 index 000000000..3d38f23f2 --- /dev/null +++ b/man/set_versions_end.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{set_versions_end} +\alias{set_versions_end} +\title{Set the \code{versions_end} attribute of an \code{epi_archive} object} +\usage{ +set_versions_end(x, versions_end) +} +\arguments{ +\item{x}{An \code{epi_archive} object} + +\item{versions_end}{The new \code{versions_end} value} +} +\value{ +An \code{epi_archive} object with the updated \code{versions_end} attribute +} +\description{ +An escape hatch for epix_as_of, which does not allow version > +\verb{$versions_end}. +} diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index f1ba84745..62eecf29d 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -4,12 +4,17 @@ \alias{sum_groups_epi_df} \title{Aggregate an \code{epi_df} object} \usage{ -sum_groups_epi_df(.x, sum_cols = "value", group_cols = character()) +sum_groups_epi_df(.x, sum_cols, group_cols = "time_value") } \arguments{ \item{.x}{an \code{epi_df}} -\item{sum_cols}{character vector of the columns to aggregate} +\item{sum_cols}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted column +name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), +\link[tidyselect:language]{other tidy-select expression}, or a vector of +characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if +they were positions in the data frame, so expressions like \code{x:y} can be +used to select a range of variables.} \item{group_cols}{character vector of column names to group by. "time_value" is included by default.} @@ -22,3 +27,12 @@ Aggregates an \code{epi_df} object by the specified group columns, summing the \code{value} column, and returning an \code{epi_df}. If aggregating over \code{geo_value}, the resulting \code{epi_df} will have \code{geo_value} set to \code{"total"}. } +\examples{ +# This data has other_keys age_group and edu_qual: +grad_employ_subset + +# Aggregate num_graduates within each geo_value (and time_value): +grad_employ_subset \%>\% + sum_groups_epi_df(num_graduates, group_cols = "geo_value") + +} diff --git a/man/tidyselect_arg_roxygen.Rd b/man/tidyselect_arg_roxygen.Rd new file mode 100644 index 000000000..27cb264d9 --- /dev/null +++ b/man/tidyselect_arg_roxygen.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inline-roxygen.R +\docType{data} +\name{tidyselect_arg_roxygen} +\alias{tidyselect_arg_roxygen} +\title{Description of a single arg that tidyselects value variables} +\format{ +An object of class \code{character} of length 1. +} +\usage{ +tidyselect_arg_roxygen +} +\description{ +Not meant for when describing tidyselect \code{...}. 
+} +\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R index b26d274b5..3c4cbff24 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,6 @@ library(testthat) library(epiprocess) +stopifnot(packageVersion("testthat") >= "3.1.5") + test_check("epiprocess") diff --git a/tests/testthat/_snaps/methods-epi_archive.md b/tests/testthat/_snaps/methods-epi_archive.md new file mode 100644 index 000000000..200e4202e --- /dev/null +++ b/tests/testthat/_snaps/methods-epi_archive.md @@ -0,0 +1,35 @@ +# filter.epi_archive works as expected + + Code + ea2 %>% filter(version <= as.Date("2020-06-02")) + Condition + Error in `filter()`: + i In argument: `version <= as.Date("2020-06-02")`. + Caused by error: + ! Using `version` in `filter.epi_archive` may produce unexpected results. + > See if `epix_as_of` or `epix_slide` would work instead. + > If not, see `?filter.epi_archive` details for how to proceed. + +--- + + Code + ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L) + Condition + Error in `filter()`: + i In argument: `cases >= 2L`. + Caused by error: + ! Using `cases` in `filter.epi_archive` may produce unexpected results. + > See `?filter.epi_archive` details for how to proceed. + +--- + + Code + ea2p %>% filter(cases >= median(cases), .by = geo_value) + Condition + Error in `filter()`: + i In argument: `cases >= median(cases)`. + i In group 1: `geo_value = "ca"`. + Caused by error: + ! Using `cases` in `filter.epi_archive` may produce unexpected results. + > See `?filter.epi_archive` details for how to proceed. + diff --git a/tests/testthat/_snaps/revision-latency-functions.md b/tests/testthat/_snaps/revision-latency-functions.md index 4f2bfe269..424dff11e 100644 --- a/tests/testthat/_snaps/revision-latency-functions.md +++ b/tests/testthat/_snaps/revision-latency-functions.md @@ -1,29 +1,41 @@ # revision_summary works for dummy datasets Code - dummy_ex %>% revision_summary() %>% print(n = 10, width = 300) + rs1 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-04. -- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.6 days 4 days Message - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 3 out of 7 (42.86%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 1 out of 4 (25%) Spread of more than 5.1 in actual value (when revised): * 3 out of 4 (75%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs1$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -47,31 +59,43 @@ --- Code - dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs2 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-04. 
-- + + -- Min lag (time to first version): Output min median mean max 0 days 1 days 1.4 days 4 days Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 3 days of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Days until within 20% of the latest value: + + -- Days until within 20% of the latest value: Output min median mean max 0 days 3 days 6.9 days 19 days + +--- + + Code + rs2$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -95,31 +119,43 @@ --- Code - dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300) + rs3 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020-01-01 to 2020-01-22. -- + + -- Min lag (time to first version): Output min median mean max 0 weeks 1 weeks 1.4 weeks 4 weeks Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 1 week of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Weeks until within 20% of the latest value: + + -- Weeks until within 20% of the latest value: Output min median mean max 0 weeks 3 weeks 6.9 weeks 19 weeks + +--- + + Code + rs3$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -143,32 +179,43 @@ --- Code - dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, - width = 300) + rs4 Message - Min lag (time to first version): + + -- An epi_archive spanning 2020 Jan to 2020 Apr. 
-- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 1 month of the `time_value`): * 2 out of 7 (28.57%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Months until within 20% of the latest value: + + -- Months until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs4$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread @@ -192,32 +239,43 @@ --- Code - dummy_ex_integerly %>% revision_summary(min_waiting_period = 60, - quick_revision = 3, drop_nas = FALSE) %>% print(n = 10, width = 300) + print(rs5, quick_revision = 3) Message - Min lag (time to first version): + + -- An epi_archive spanning 1 to 4. -- + + -- Min lag (time to first version): Output min median mean max 0 1 1.4 4 Message Fraction of all versions that are `NA`: * 2 out of 19 (10.53%) - Fraction of epi_key+time_values with + + -- Fraction of epi_key + time_values with No revisions: * 2 out of 7 (28.57%) Quick revisions (last revision within 3 time steps of the `time_value`): * 4 out of 7 (57.14%) Few revisions (At most 3 revisions for that `time_value`): * 6 out of 7 (85.71%) - Fraction of revised epi_key+time_values which have: + + -- Fraction of revised epi_key + time_values which have: Less than 0.1 spread in relative value: * 2 out of 5 (40%) Spread of more than 5.1 in actual value (when revised): * 3 out of 5 (60%) - Time Steps until within 20% of the latest value: + + -- Time Steps until within 20% of the latest value: Output min median mean max 0 3 6.9 19 + +--- + + Code + rs5$revision_behavior %>% print(n = 10, width = 300) + Output # A tibble: 7 x 11 time_value geo_value n_revisions min_lag max_lag lag_near_latest spread diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index ee500d308..878cde1c0 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -71,14 +71,21 @@ test_that("`validate_version_bound` validate and class checks together allow and # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same `class`") expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same `class`") - expect_error(validate_version_bound( - `class<-`(list(2), "clazz"), - tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same `typeof`", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) + # Maybe questionable, but accept to relax things a bit, as this is happening + # with Dates in some R(?) versions. 
diff --git a/tests/testthat/test-compactify.R b/tests/testthat/test-compactify.R
index 2eed5025f..229af8453 100644
--- a/tests/testthat/test-compactify.R
+++ b/tests/testthat/test-compactify.R
@@ -120,37 +120,54 @@ test_that("compactify does not alter the default clobberable and observed versio
   expect_identical(ea_true$versions_end, ea_false$versions_end)
 })
 
+quantile_pred_once <- function(estimates_vec, levels_vec) {
+  hardhat::quantile_pred(t(as.matrix(estimates_vec)), levels_vec)
+}
 test_that("compactify works on distributions", {
+  skip("Until #611 is merged or hardhat/epipredict is patched")
   forecasts <- tibble(
     ahead = 2L,
     geo_value = "ak",
     target_end_date = as.Date("2020-01-19"),
-    forecast_date = as.Date("2020-01-17") + 1:8,
+    forecast_date = as.Date("2020-01-17") + 1:6,
     actual = 25,
     .pred_distn = c(
-      epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)),
-      epipredict::dist_quantiles(c(1, NA, 9), c(0.1, 0.5, 0.9)), # single NA in quantiles
-      epipredict::dist_quantiles(c(NA, NA, NA), c(0.1, 0.5, 0.9)), # all NAs in quantiles
-      distributional::dist_missing(1), # the actual `NA` for distributions
-      epipredict::dist_quantiles(c(1, 5, 9), c(0.1, 0.5, 0.9)), # and back
-      epipredict::dist_quantiles(c(3, 5, 9), c(0.1, 0.5, 0.9)), # change quantile
-      epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)), # change level
-      epipredict::dist_quantiles(c(3, 5, 9), c(0.2, 0.5, 0.8)) # LOCF
+      quantile_pred_once(c(1, 5, 9), c(0.1, 0.5, 0.9)),
+      quantile_pred_once(c(1, NA, 9), c(0.1, 0.5, 0.9)), # single NA in quantiles
+      quantile_pred_once(c(NA, NA, NA), c(0.1, 0.5, 0.9)), # all NAs in quantiles
+      quantile_pred_once(c(1, 5, 9), c(0.1, 0.5, 0.9)), # and back
+      quantile_pred_once(c(3, 5, 9), c(0.1, 0.5, 0.9)), # change quantile
+      quantile_pred_once(c(3, 5, 9), c(0.1, 0.5, 0.9)) # LOCF
     )
   )
   expect_equal(
     forecasts %>%
-      as_epi_archive(
-        other_keys = "ahead", time_value = target_end_date, version = forecast_date,
-        compactify = TRUE
-      ) %>%
+      as_epi_archive(other_keys = "ahead", time_value = target_end_date, version = forecast_date) %>%
       .$DT %>%
       as.data.frame() %>%
       as_tibble(),
-    forecasts[-8, ] %>%
+    forecasts[-6, ] %>%
       rename(time_value = target_end_date, version = forecast_date)
   )
 })
+test_that("epix_merge works with distributions", {
+  skip("Until hardhat/epipredict is patched")
+  forecasts1ea <- tibble(
+    ahead = 2L,
+    geo_value = "ak",
+    target_end_date = as.Date("2020-01-19"),
+    forecast_date = as.Date("2020-01-17") + 1,
+    .pred_distn1 = quantile_pred_once(c(1, 5, 9), c(0.1, 0.5, 0.9))
+  ) %>% as_epi_archive(other_keys = "ahead", time_value = target_end_date, version = forecast_date)
+  forecasts2ea <- tibble(
+    ahead = 2L,
+    geo_value = "ak",
+    target_end_date = as.Date("2020-01-19"),
+    forecast_date = as.Date("2020-01-17") + 2,
+    .pred_distn2 = quantile_pred_once(c(2, 4, 8), c(0.1, 0.5, 0.9))
+  ) %>% as_epi_archive(other_keys = "ahead", time_value = target_end_date, version = forecast_date)
+  forecasts12ea <- epix_merge(forecasts1ea, forecasts2ea, sync = "locf")
+})
 test_that("Large compactify_abs_tol does not drop edf keys", {
   # several epikeytimes, each with a single version
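For context, `quantile_pred_once()` above packages one forecast's quantile estimates for `hardhat::quantile_pred()`, which takes a row-per-observation matrix of estimates plus the matching quantile levels. A usage sketch (assuming a hardhat version that exports `quantile_pred()`, as the helper relies on):

```r
# One forecast's quantile estimates as a 1 x 3 matrix, with matching levels:
qp <- hardhat::quantile_pred(
  t(as.matrix(c(1, 5, 9))), # rows = observations, columns = quantile levels
  c(0.1, 0.5, 0.9)
)
qp # a length-1 quantile_pred vector, usable as a tibble column
```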
diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R
index 0aa4aca7f..b9be125df 100644
--- a/tests/testthat/test-epi_slide.R
+++ b/tests/testthat/test-epi_slide.R
@@ -610,7 +610,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", {
   before <- 2L
   after <- 1L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = days) %>%
@@ -627,7 +627,7 @@
       pad_late_dates = as.Date(c("2022-01-08"))
     )
   )
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = weeks) %>%
@@ -677,7 +677,7 @@
   before <- 5L
   after <- 0L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = days) %>%
@@ -701,7 +701,7 @@
   before <- 0L
   after <- 3L
 
-  expect_identical(
+  expect_equal(
     full_date_seq(
       epi_data_missing %>%
         mutate(time_value = days) %>%
diff --git a/tests/testthat/test-epix_merge.R b/tests/testthat/test-epix_merge.R
index 5b3de284f..ded143531 100644
--- a/tests/testthat/test-epix_merge.R
+++ b/tests/testthat/test-epix_merge.R
@@ -175,7 +175,7 @@ test_that("epix_merge forbids and warns on metadata and naming issues", {
       as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 1L)),
       as_epi_archive(tibble::tibble(geo_value = "ak", time_value = test_date, version = test_date + 1L, value = 2L))
     ),
-    regexp = "overlapping.*names"
+    regexp = 'both have measurement columns named "value"'
   )
 })
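The `expect_identical()` → `expect_equal()` swaps above (and in `test-time-utils.R` further down) trade byte-for-byte exactness for tolerance of representation differences; roughly:

```r
# expect_equal() compares with all.equal()-style tolerance, so values that
# differ only in storage mode (e.g., integer- vs. double-backed) still pass:
identical(1L, 1)         # FALSE: integer vs. double
isTRUE(all.equal(1L, 1)) # TRUE: numerically equal
```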
diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R
index 45ba6ea12..c544368a5 100644
--- a/tests/testthat/test-methods-epi_archive.R
+++ b/tests/testthat/test-methods-epi_archive.R
@@ -128,3 +128,179 @@ test_that("group_vars works as expected", {
     "geo_value"
   )
 })
+
+test_that("epix_as_of_now works as expected", {
+  expect_equal(
+    attr(ea2_data %>% as_epi_archive() %>% epix_as_of_current(), "metadata")$as_of,
+    as.Date("2020-06-04")
+  )
+  time_value <- as.Date("2020-06-01")
+  df <- dplyr::tribble(
+    ~geo_value, ~time_value, ~version, ~cases,
+    "ca", time_value, time_value, 1,
+    "ca", time_value + 7, time_value + 7, 2,
+  )
+  expect_equal(
+    attr(df %>% as_epi_archive() %>% epix_as_of_current(), "metadata")$as_of,
+    as.Date("2020-06-08")
+  )
+  time_value <- tsibble::yearmonth(as.Date("2020-06-01") - lubridate::month(1))
+  df <- dplyr::tribble(
+    ~geo_value, ~time_value, ~version, ~cases,
+    "ca", time_value, time_value, 1,
+    "ca", time_value + lubridate::month(1), time_value + lubridate::month(1), 2,
+  )
+  expect_equal(
+    attr(df %>% as_epi_archive() %>% epix_as_of_current(), "metadata")$as_of,
+    tsibble::yearmonth("2020-06")
+  )
+  time_value <- 2020
+  df <- dplyr::tribble(
+    ~geo_value, ~time_value, ~version, ~cases,
+    "ca", time_value, time_value, 1,
+    "ca", time_value + 7, time_value + 7, 2,
+  )
+  expect_equal(
+    attr(df %>% as_epi_archive() %>% epix_as_of_current(), "metadata")$as_of,
+    2027
+  )
+})
+
+test_that("filter.epi_archive works as expected", {
+  ea2 <- ea2_data %>%
+    as_epi_archive()
+
+  # Some basic output value checks:
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "tn"),
+    new_epi_archive(
+      ea2$DT[FALSE],
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  expect_equal(
+    ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")),
+    new_epi_archive(
+      data.table::data.table(
+        geo_value = "ca", time_value = as.Date("2020-06-02"),
+        version = as.Date("2020-06-02") + 0:2, cases = 0:2
+      ),
+      ea2$geo_type, ea2$time_type, ea2$other_keys,
+      ea2$clobberable_versions_start, ea2$versions_end
+    )
+  )
+
+  # Output geo_type and time_type behavior:
+
+  hrr_day_ea <- tibble(
+    geo_value = c(rep(1, 14), 100),
+    time_value = as.Date("2020-01-01") - 1 + c(1:14, 14),
+    version = time_value + 3,
+    value = 1:15
+  ) %>%
+    as_epi_archive()
+
+  expect_equal(hrr_day_ea$geo_type, "hrr")
+  expect_equal(hrr_day_ea$time_type, "day")
+
+  hrr_week_ea <- hrr_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(hrr_week_ea$geo_type, "hrr")
+  expect_equal(hrr_week_ea$time_type, "week")
+
+  hrr_one_week_ea <- hrr_week_ea %>%
+    filter(time_value == time_value[[1]])
+
+  expect_equal(hrr_one_week_ea$time_type, "week")
+
+  intcustom_day_ea <- hrr_day_ea
+  intcustom_day_ea$geo_type <- "custom"
+
+  intcustom_week_ea <- intcustom_day_ea %>%
+    filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L)
+
+  expect_equal(intcustom_week_ea$geo_type, "custom")
+  expect_equal(intcustom_week_ea$time_type, "week")
+
+  # Environment variables should be fine:
+  version <- as.Date("2020-06-02") + 1
+  e <- version
+  expected <- ea2 %>% filter(geo_value == "ca", as.Date("2020-06-02") + 1 <= time_value)
+  expect_equal(ea2 %>% filter(geo_value == "ca", .env$version <= time_value), expected)
+  expect_equal(ea2 %>% filter(geo_value == "ca", e <= time_value), expected)
+  expect_equal(ea2 %>% filter(geo_value == "ca", .env$e <= time_value), expected)
+
+  # Error-raising:
+  expect_error(
+    ea2 %>% filter(version == as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_error(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    class = "epiprocess__filter_archive__used_version"
+  )
+  expect_snapshot(
+    ea2 %>% filter(version <= as.Date("2020-06-02")),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_snapshot(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    error = TRUE, cnd_class = TRUE
+  )
+  expect_error(
+    ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  # Check for `for` + `delayedAssign` mishap in `expect_snapshot` (we should say
+  # something about `cases` (the relevant colname), not `deaths` (the last
+  # measurement colname)):
+  ea2p <- ea2_data %>%
+    mutate(deaths = 0) %>%
+    as_epi_archive()
+  expect_error(
+    ea2p %>% filter(cases >= median(cases), .by = geo_value),
+    class = "epiprocess__filter_archive__used_measurement"
+  )
+  expect_snapshot(
+    ea2p %>% filter(cases >= median(cases), .by = geo_value),
+    error = TRUE, cnd_class = TRUE
+  )
+  # Check that we are insulated from other lazy eval traps:
+  expected <- rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))
+  expect_class(expected$parent, "epiprocess__filter_archive__used_measurement")
+  with(list(cli_abort = function(...) stop("now, pretend user didn't have cli attached")), {
+    expect_equal(
+      rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message,
+      expected$parent$message
+    )
+  })
+  expect_equal(
+    rlang::catch_cnd(ea2p %>% filter(
+      {
+        c <- function(...) stop("and that they overwrote `c` to try to debug their own code")
+        cases >= median(cases)
+      },
+      .by = geo_value
+    ))$parent$message,
+    expected$parent$message
+  )
+
+  # Escape hatch:
+  expect_equal(
+    ea2 %>%
+      filter(version <= time_value + as.difftime(1, units = "days"),
+        .format_aware = TRUE
+      ) %>%
+      .$DT,
+    ea2$DT[version <= time_value + as.difftime(1, units = "days"), ]
+  )
+})
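Taken together, the tests above pin down the `filter.epi_archive` contract: key columns (`geo_value`, `time_value`, other keys) may be filtered on, `version` and measurement columns are rejected, and `.format_aware = TRUE` is the escape hatch. A condensed sketch (hypothetical mini-archive; argument and condition-class names follow the tests):

```r
library(epiprocess)
library(dplyr)

ea <- as_epi_archive(tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2020-06-01") + 0:2,
  version = as.Date("2020-06-02") + 0:2,
  cases = 1:3
))
ea %>% filter(geo_value == "ca")                  # key columns: allowed
# ea %>% filter(version <= as.Date("2020-06-02")) # version: error
# ea %>% filter(cases >= 2L)                      # measurement column: error
ea %>% filter(                                    # escape hatch from the tests:
  version <= time_value + as.difftime(1, units = "days"),
  .format_aware = TRUE
)
```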
stop("now, pretend user didn't have cli attached")), { + expect_equal( + rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message, + expected$parent$message + ) + }) + expect_equal( + rlang::catch_cnd(ea2p %>% filter( + { + c <- function(...) stop("and that they overwrote `c` to try to debug their own code") + cases >= median(cases) + }, + .by = geo_value + ))$parent$message, + expected$parent$message + ) + + + # Escape hatch: + expect_equal( + ea2 %>% + filter(version <= time_value + as.difftime(1, units = "days"), + .format_aware = TRUE + ) %>% + .$DT, + ea2$DT[version <= time_value + as.difftime(1, units = "days"), ] + ) +}) diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R index 3e5c180b0..bc9f1e35d 100644 --- a/tests/testthat/test-methods-epi_df.R +++ b/tests/testthat/test-methods-epi_df.R @@ -311,20 +311,25 @@ test_that("complete.epi_df works", { }) test_that("sum_groups_epi_df works", { - out <- toy_epi_df %>% sum_groups_epi_df(sum_cols = "x") + out <- toy_epi_df %>% sum_groups_epi_df("x") expected_out <- toy_epi_df %>% group_by(time_value) %>% summarize(x = sum(x)) %>% mutate(geo_value = "total") %>% as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of) expect_equal(out, expected_out) + out <- toy_epi_df %>% sum_groups_epi_df(x) + expect_equal(out, expected_out) out <- toy_epi_df %>% - sum_groups_epi_df(sum_cols = c("x", "y"), group_cols = c("time_value", "geo_value", "indic_var1")) + sum_groups_epi_df(c(x, y), group_cols = c("time_value", "geo_value", "indic_var1")) expected_out <- toy_epi_df %>% group_by(time_value, geo_value, indic_var1) %>% summarize(x = sum(x), y = sum(y), .groups = "drop") %>% as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of, other_keys = "indic_var1") %>% arrange_canonical() expect_equal(out, expected_out) + out <- toy_epi_df %>% + sum_groups_epi_df(x:y, group_cols = c("time_value", "geo_value", "indic_var1")) + expect_equal(out, expected_out) }) diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R index b636bf32e..1a61e8d88 100644 --- a/tests/testthat/test-revision-latency-functions.R +++ b/tests/testthat/test-revision-latency-functions.R @@ -64,27 +64,36 @@ dummy_ex_integerly <- dummy_ex$DT %>% stopifnot(dummy_ex_integerly$time_type == "integer") test_that("revision_summary works for dummy datasets", { - expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300)) - expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs1 <- dummy_ex %>% revision_summary() + rs2 <- dummy_ex %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs1) + expect_snapshot(rs1$revision_behavior %>% print(n = 10, width = 300)) + expect_snapshot(rs2) + expect_snapshot(rs2$revision_behavior %>% print(n = 10, width = 300)) # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes: - expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300)) + rs3 <- dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) + expect_snapshot(rs3) + expect_snapshot(rs3$revision_behavior %>% print(n = 10, width = 300)) # Yearmonthly has the same story. 
diff --git a/tests/testthat/test-revision-latency-functions.R b/tests/testthat/test-revision-latency-functions.R
index b636bf32e..1a61e8d88 100644
--- a/tests/testthat/test-revision-latency-functions.R
+++ b/tests/testthat/test-revision-latency-functions.R
@@ -64,27 +64,36 @@ dummy_ex_integerly <- dummy_ex$DT %>%
 stopifnot(dummy_ex_integerly$time_type == "integer")
 
 test_that("revision_summary works for dummy datasets", {
-  expect_snapshot(dummy_ex %>% revision_summary() %>% print(n = 10, width = 300))
-  expect_snapshot(dummy_ex %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300))
+  rs1 <- dummy_ex %>% revision_summary()
+  rs2 <- dummy_ex %>% revision_summary(drop_nas = FALSE)
+  expect_snapshot(rs1)
+  expect_snapshot(rs1$revision_behavior %>% print(n = 10, width = 300))
+  expect_snapshot(rs2)
+  expect_snapshot(rs2$revision_behavior %>% print(n = 10, width = 300))
   # Weekly dummy is mostly just "day" -> "week", but quick-revision summary changes:
-  expect_snapshot(dummy_ex_weekly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300))
+  rs3 <- dummy_ex_weekly %>% revision_summary(drop_nas = FALSE)
+  expect_snapshot(rs3)
+  expect_snapshot(rs3$revision_behavior %>% print(n = 10, width = 300))
   # Yearmonthly has the same story. It would have been close to encountering
   # min_waiting_period-based filtering but we actually set its versions_end to
   # sometime in 2080 rather than 2022:
-  expect_snapshot(dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE) %>% print(n = 10, width = 300))
+  rs4 <- dummy_ex_yearmonthly %>% revision_summary(drop_nas = FALSE)
+  expect_snapshot(rs4)
+  expect_snapshot(rs4$revision_behavior %>% print(n = 10, width = 300))
   # Integer is very much like daily. We have to provide some of the
   # configuration arguments since we have no idea about what the integers
   # represent. If the possible integers being used have large jumps like
   # YYYYww-as-integer epiweek labels (e.g., 200053 jumps to 200101) or are
   # regularly spaced apart but by more than 1, we'll still be producing
   # something nonsensical, but we tried.
-  expect_snapshot(dummy_ex_integerly %>%
+  rs5 <- dummy_ex_integerly %>%
     revision_summary(
-      min_waiting_period = 60, quick_revision = 3,
+      min_waiting_period = 60,
       drop_nas = FALSE
-    ) %>%
-    print(n = 10, width = 300))
+    )
+  expect_snapshot(print(rs5, quick_revision = 3))
+  expect_snapshot(rs5$revision_behavior %>% print(n = 10, width = 300))
 })
 
 test_that("tidyselect is functional", {
@@ -133,7 +142,7 @@ test_that("revision_summary default min_waiting_period works as expected", {
       value = 1:2
     ) %>%
       as_epi_archive(versions_end = as.Date("2020-01-01") + 1 + 59) %>%
-      revision_summary(print_inform = FALSE) %>%
+      revision_summary(return_only_tibble = TRUE) %>%
       pull(time_value),
     as.Date("2020-01-01")
   )
@@ -146,7 +155,7 @@
       value = 1:2
     ) %>%
       as_epi_archive(versions_end = as.Date("2020-01-01") + 7 + 56) %>%
-      revision_summary(print_inform = FALSE) %>%
+      revision_summary(return_only_tibble = TRUE) %>%
       pull(time_value),
     as.Date("2020-01-01")
   )
@@ -159,7 +168,7 @@
       value = 1:2
     ) %>%
      as_epi_archive(versions_end = tsibble::make_yearmonth(2000, 3)) %>%
-      revision_summary(print_inform = FALSE) %>%
+      revision_summary(return_only_tibble = TRUE) %>%
       pull(time_value),
     tsibble::make_yearmonth(2000, 1)
   )
@@ -172,7 +181,7 @@
       value = 1:2
     ) %>%
       as_epi_archive(versions_end = 1 + 1 + 59) %>%
-      revision_summary(print_inform = FALSE),
+      revision_summary(return_only_tibble = TRUE),
     regexp = "Unsupported time_type"
   )
 })
diff --git a/tests/testthat/test-time-utils.R b/tests/testthat/test-time-utils.R
index 6fe8d78ad..7ddd70c04 100644
--- a/tests/testthat/test-time-utils.R
+++ b/tests/testthat/test-time-utils.R
@@ -17,11 +17,11 @@ test_that("guess_period works", {
   # On Dates:
   daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day")
   weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week")
-  expect_identical(
+  expect_equal(
     daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L),
     daily_dates
   )
-  expect_identical(
+  expect_equal(
     weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L),
     weekly_dates
   )
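The `guess_period()` assertions check a reconstruction identity: the first element plus the guessed gap times `0, 1, 2, ...` recovers the whole sequence. The switch to `expect_equal()` presumably allows the reconstruction to differ from the input in representation only. A sketch using epiprocess's `guess_period()` helper, as the tests do:

```r
weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week")
p <- guess_period(weekly_dates) # a 7-day gap, in some numeric/difftime form
# Rebuild the sequence from its start; equal to weekly_dates, though possibly
# not byte-identical:
weekly_dates[[1L]] + p * (seq_along(weekly_dates) - 1L)
```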
 Here is a sample of the output:
 
 ```{r}
-revision_details <- revision_summary(dv_archive, print_inform = TRUE)
+revision_details <- revision_summary(dv_archive)
+revision_details
 ```
 
 We can see from the output that, as mentioned above, this data set has a lot of
@@ -174,9 +175,9 @@ inspect the returned `revision_details` tibble. Here we collect a number of
 statistics for each state:
 
 ```{r}
-revision_details %>%
+revision_details$revision_behavior %>%
   group_by(geo_value) %>%
-  summarise(
+  summarize(
     n_rev = mean(n_revisions),
     min_lag = min(min_lag),
     max_lag = max(max_lag),
@@ -234,7 +235,8 @@ observation carried forward (LOCF). For more information, see `epix_merge()`.
 
 ## Backtesting forecasting models
 
 One of the most common use cases of the `epiprocess::epi_archive()` object is for
-accurate model backtesting. See `vignette("backtesting", package="epipredict")`
+accurate model backtesting. See [`vignette("backtesting",
+package="epipredict")`](https://cmu-delphi.github.io/epipredict/articles/backtesting.html)
 for an in-depth demo, using a pre-built forecaster in that package.
 
 ## Attribution
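Finally, a sketch of the `epix_merge()` LOCF sync the vignette points to, with hypothetical archives `x_archive` and `y_archive` (the `sync = "locf"` option also appears in the new `test-compactify.R` merge test):

```r
# Column-wise merge of two archives; where one archive's versions end earlier,
# its last observed values are carried forward to align the version axes:
merged <- epix_merge(x_archive, y_archive, sync = "locf")
```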