Skip to content

A variable called n on the frame causes issues with select_sample and ExpectedHits #7

@szimmer

Description

@szimmer

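Add more documentation on which methods output a selection probability and which output expected hits, and explain how expected hits is calculated. In the reprex below, when the frame contains a column named `n` (set to 50), `ExpectedHits` comes out 5 times larger than when that column is dropped, even though the requested sample size is 10 per region in both runs; only the run without `n` matches the manual calculation `ExpHits_man`.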

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(SampleSelectR)
set.seed(8675309)

# Build the test frame: keep only GEOID, Region, and Pop_Tot, then add
#   * a constant column literally named `n` (the column under investigation),
#   * ExpHits_man, a hand-computed value: 10 times each row's share of
#     Pop_Tot within its Region.
county_2023_slim_n <- mutate(
  select(county_2023, GEOID, Region, Pop_Tot),
  n = 50,
  ExpHits_man = 10 * Pop_Tot / sum(Pop_Tot),
  .by = "Region"
)

# Sample-size table: one row per distinct Region, each with sample_size = 10.
sampsizes <- mutate(
  distinct(county_2023_slim_n, Region),
  sample_size = 10
)

# Draw the "sys_pps" sample, stratified by Region with Pop_Tot as the measure
# of size, from the frame that still contains the `n` column.
samp1 <- select_sample(
  county_2023_slim_n,
  "sys_pps",
  n = sampsizes,
  strata = "Region",
  mos = "Pop_Tot",
  outall = TRUE
)
#> No sorting variables are provided so frame is assumed to be already sorted for systematic sampling.
#> Stratum: Region = South 
#> --Frame size: 1422
#> --Sample size: 10
#> --Sampling interval (k): 12763101
#> --Random start (r): 2035506
#> Stratum: Region = West 
#> --Frame size: 449
#> --Sample size: 10
#> --Sampling interval (k): 7864612
#> --Random start (r): 3760766
#> Stratum: Region = Northeast 
#> --Frame size: 218
#> --Sample size: 10
#> --Sampling interval (k): 5722245
#> --Random start (r): 4376366
#> Stratum: Region = Midwest 
#> --Frame size: 1055
#> --Sample size: 10
#> --Sampling interval (k): 6888795
#> --Random start (r): 5302221
# Draw the "sys_pps" sample with the same arguments, but from a copy of the
# frame with the `n` column removed first.
samp2 <- select_sample(
  select(county_2023_slim_n, -n),
  "sys_pps",
  n = sampsizes,
  strata = "Region",
  mos = "Pop_Tot",
  outall = TRUE
)
#> No sorting variables are provided so frame is assumed to be already sorted for systematic sampling.
#> Stratum: Region = South 
#> --Frame size: 1422
#> --Sample size: 10
#> --Sampling interval (k): 12763101
#> --Random start (r): 3427512
#> Stratum: Region = West 
#> --Frame size: 449
#> --Sample size: 10
#> --Sampling interval (k): 7864612
#> --Random start (r): 5293245
#> Stratum: Region = Northeast 
#> --Frame size: 218
#> --Sample size: 10
#> --Sampling interval (k): 5722245
#> --Random start (r): 5600881
#> Stratum: Region = Midwest 
#> --Frame size: 1055
#> --Sample size: 10
#> --Sampling interval (k): 6888795
#> --Random start (r): 5830173

# Print the result drawn from the frame that includes the `n` column.
samp1
#> # A tidytable: 3,144 × 9
#>    Region GEOID Pop_Tot     n ExpHits_man SelectionIndicator SamplingWeight
#>    <fct>  <chr>   <dbl> <dbl>       <dbl> <lgl>                       <dbl>
#>  1 South  01001   59285    50    0.00465  FALSE                          NA
#>  2 South  01003  239945    50    0.0188   FALSE                          NA
#>  3 South  01005   24757    50    0.00194  FALSE                          NA
#>  4 South  01007   22152    50    0.00174  FALSE                          NA
#>  5 South  01009   59292    50    0.00465  FALSE                          NA
#>  6 South  01011   10157    50    0.000796 FALSE                          NA
#>  7 South  01013   18807    50    0.00147  FALSE                          NA
#>  8 South  01015  116141    50    0.00910  FALSE                          NA
#>  9 South  01017   34450    50    0.00270  FALSE                          NA
#> 10 South  01019   25224    50    0.00198  FALSE                          NA
#> # ℹ 3,134 more rows
#> # ℹ 2 more variables: NumberHits <int>, ExpectedHits <dbl>
# Print the result drawn from the frame with the `n` column removed.
samp2
#> # A tidytable: 3,144 × 8
#>    Region GEOID Pop_Tot ExpHits_man SelectionIndicator SamplingWeight NumberHits
#>    <fct>  <chr>   <dbl>       <dbl> <lgl>                       <dbl>      <int>
#>  1 South  01001   59285    0.00465  FALSE                          NA          0
#>  2 South  01003  239945    0.0188   FALSE                          NA          0
#>  3 South  01005   24757    0.00194  FALSE                          NA          0
#>  4 South  01007   22152    0.00174  FALSE                          NA          0
#>  5 South  01009   59292    0.00465  FALSE                          NA          0
#>  6 South  01011   10157    0.000796 FALSE                          NA          0
#>  7 South  01013   18807    0.00147  FALSE                          NA          0
#>  8 South  01015  116141    0.00910  FALSE                          NA          0
#>  9 South  01017   34450    0.00270  FALSE                          NA          0
#> 10 South  01019   25224    0.00198  FALSE                          NA          0
#> # ℹ 3,134 more rows
#> # ℹ 1 more variable: ExpectedHits <dbl>
# Compare the two results after dropping the columns that depend on the
# random draw (and the extra `n` column from the first result), so that only
# the remaining columns, including ExpectedHits, are checked.
waldo::compare(
  select(samp1, -c(SelectionIndicator, SamplingWeight, NumberHits, n)),
  select(samp2, -c(SelectionIndicator, SamplingWeight, NumberHits))
)
#> old vs new
#>               ExpectedHits
#> - old[1, ]    2.322515e-02
#> + new[1, ]    4.645031e-03
#> - old[2, ]    9.399949e-02
#> + new[2, ]    1.879990e-02
#> - old[3, ]    9.698662e-03
#> + new[3, ]    1.939732e-03
#> - old[4, ]    8.678142e-03
#> + new[4, ]    1.735628e-03
#> - old[5, ]    2.322790e-02
#> + new[5, ]    4.645579e-03
#> - old[6, ]    3.979049e-03
#> + new[6, ]    7.958097e-04
#> - old[7, ]    7.367723e-03
#> + new[7, ]    1.473545e-03
#> - old[8, ]    4.549874e-02
#> + new[8, ]    9.099748e-03
#> - old[9, ]    1.349594e-02
#> + new[9, ]    2.699187e-03
#> - old[10, ]   9.881611e-03
#> + new[10, ]   1.976322e-03
#> and 3134 more ...
#> 
#>      old$ExpectedHits | new$ExpectedHits                  
#>  [1] 0.023225         - 0.004645         [1]              
#>  [2] 0.093999         - 0.018800         [2]              
#>  [3] 0.009699         - 0.001940         [3]              
#>  [4] 0.008678         - 0.001736         [4]              
#>  [5] 0.023228         - 0.004646         [5]              
#>  [6] 0.003979         - 0.000796         [6]              
#>  [7] 0.007368         - 0.001474         [7]              
#>  [8] 0.045499         - 0.009100         [8]              
#>  [9] 0.013496         - 0.002699         [9]              
#> [10] 0.009882         - 0.001976         [10]             
#>  ... ...                ...              and 3134 more ...

Created on 2025-11-19 with reprex v2.1.1

Session info

sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.5.1 (2025-06-13 ucrt)
#>  os       Windows 11 x64 (build 22631)
#>  system   x86_64, mingw32
#>  ui       RTerm
#>  language (EN)
#>  collate  English_United States.utf8
#>  ctype    English_United States.utf8
#>  tz       America/New_York
#>  date     2025-11-19
#>  pandoc   3.6.3 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
#>  quarto   1.8.24 @ C:\\Users\\sazimmer\\AppData\\Local\\Programs\\Quarto\\bin\\quarto.exe
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package       * version    date (UTC) lib source
#>  cli             3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
#>  crayon          1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
#>  data.table      1.17.8     2025-07-10 [1] RSPM (R 4.5.0)
#>  diffobj         0.3.6      2025-04-21 [1] CRAN (R 4.5.0)
#>  digest          0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
#>  dplyr         * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
#>  evaluate        1.0.5      2025-08-27 [1] RSPM (R 4.5.0)
#>  fastmap         1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
#>  fs              1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
#>  generics        0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
#>  glue            1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
#>  htmltools       0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
#>  knitr           1.50       2025-03-16 [1] CRAN (R 4.5.0)
#>  lifecycle       1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
#>  magrittr        2.0.3      2022-03-30 [1] CRAN (R 4.5.0)
#>  pillar          1.11.0     2025-07-04 [1] RSPM (R 4.5.0)
#>  pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
#>  R6              2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
#>  reprex          2.1.1      2024-07-06 [1] CRAN (R 4.5.0)
#>  rlang           1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
#>  rmarkdown       2.29       2024-11-04 [1] CRAN (R 4.5.0)
#>  rstudioapi      0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
#>  SampleSelectR * 1.0.0      2025-09-22 [1] Github (rti-international/SampleSelectR@2f7d23c)
#>  sessioninfo     1.2.3.9000 2025-09-18 [1] Github (r-lib/sessioninfo@ec4dd0c)
#>  tibble          3.3.0      2025-06-08 [1] RSPM (R 4.5.0)
#>  tidyselect      1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
#>  tidytable       0.11.2     2024-12-11 [1] CRAN (R 4.5.0)
#>  utf8            1.2.6      2025-06-08 [1] RSPM (R 4.5.0)
#>  vctrs           0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
#>  waldo           0.6.2      2025-07-11 [1] RSPM
#>  withr           3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
#>  xfun            0.53       2025-08-19 [1] RSPM (R 4.5.0)
#>  yaml            2.3.10     2024-07-26 [1] CRAN (R 4.5.0)
#> 
#>  [1] C:/Users/sazimmer/AppData/Local/R/win-library/4.5
#>  [2] C:/Program Files/R/R-4.5.1/library
#>  * ── Packages attached to the search path.
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions