Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slow with dm::dm_filter() #441

Open
krlmlr opened this issue Jan 4, 2025 · 4 comments
Open

Slow with dm::dm_filter() #441

krlmlr opened this issue Jan 4, 2025 · 4 comments
Milestone

Comments

@krlmlr
Copy link
Member

krlmlr commented Jan 4, 2025

Slow even with empty duck tibbles; see dm_duckplyr_ptype. I want to investigate what's going on here. It's not the "meta" functionality; I checked that.

# Load dm and keep a cached copy of the financial example data model on disk
# so that repeated runs can read it back instead of rebuilding it.
library(dm)

# Build the cache only when the file is missing from the working directory.
if (!file.exists("dm_local.qs")) {
  dm <- dm_financial()

  # Presumably collect() materializes the source tables as local data frames
  # -- TODO confirm.
  dm_local <- dm |>
    collect()

  qs::qsave(dm_local, file = "dm_local.qs")
}

# Always read the dm back from the cache file, even right after writing it.
dm_local <- qs::qread("dm_local.qs")

# Baseline: time dm_filter() on the cached dm, restricting `districts` to the
# row with id 1.
dm_local |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   0.184   0.008   0.196

# Derive the prototype of the dm; the issue text describes the tables built
# from it as empty.
dm_local_ptype <-
  dm_local |>
  dm_ptype()

# Same dm_filter() call, timed on the prototype dm.
dm_local_ptype |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   0.077   0.002   0.078

# Hand-written variant of the filter, timed as a whole: zoom to `districts`
# and filter it, then zoom to each remaining table in turn and semi_join() it
# against a table that was reduced in an earlier step, writing every result
# back with dm_update_zoomed().
# Presumably this mirrors what dm_filter() does internally -- TODO confirm.
dm_local |>
  dm_zoom_to(districts) |>
  filter(id == 1L) |>
  dm_update_zoomed() |>
  dm_zoom_to(accounts) |>
  semi_join(districts) |>
  dm_update_zoomed() |>
  dm_zoom_to(loans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(orders) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(trans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(disps) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(cards) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  dm_zoom_to(clients) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  system.time()
#>    user  system elapsed 
#>   0.031   0.001   0.032

# Attach duckplyr; per the startup message below, this overwrites the dplyr
# methods with duckplyr methods for the rest of the session.
library(duckplyr)
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.

# Extract every table from dm_local and convert it with as_duck_tbl().
# keyed = TRUE: presumably keeps key information on the extracted table
# -- TODO confirm.
trans <- pull_tbl(dm_local, "trans", keyed = TRUE) |> as_duck_tbl()
districts <- pull_tbl(dm_local, "districts", keyed = TRUE) |> as_duck_tbl()
clients <- pull_tbl(dm_local, "clients", keyed = TRUE) |> as_duck_tbl()
orders <- pull_tbl(dm_local, "orders", keyed = TRUE) |> as_duck_tbl()
cards <- pull_tbl(dm_local, "cards", keyed = TRUE) |> as_duck_tbl()
disps <- pull_tbl(dm_local, "disps", keyed = TRUE) |> as_duck_tbl()
tkeys <- pull_tbl(dm_local, "tkeys", keyed = TRUE) |> as_duck_tbl()
accounts <- pull_tbl(dm_local, "accounts", keyed = TRUE) |> as_duck_tbl()
loans <- pull_tbl(dm_local, "loans", keyed = TRUE) |> as_duck_tbl()

# Reassemble a dm from the converted tables and declare its keys explicitly:
# an `id` primary key on every table except `tkeys`, followed by the foreign
# keys between the tables. `tkeys` takes part in no key relation here.
dm_duckplyr <-
  dm::dm(
    trans,
    districts,
    clients,
    orders,
    cards,
    disps,
    tkeys,
    accounts,
    loans,
  ) %>%
  dm::dm_add_pk(trans, id) %>%
  dm::dm_add_pk(districts, id) %>%
  dm::dm_add_pk(clients, id) %>%
  dm::dm_add_pk(orders, id) %>%
  dm::dm_add_pk(cards, id) %>%
  dm::dm_add_pk(disps, id) %>%
  dm::dm_add_pk(accounts, id) %>%
  dm::dm_add_pk(loans, id) %>%
  dm::dm_add_fk(accounts, district_id, districts) %>%
  dm::dm_add_fk(disps, client_id, clients) %>%
  dm::dm_add_fk(cards, disp_id, disps) %>%
  dm::dm_add_fk(loans, account_id, accounts) %>%
  dm::dm_add_fk(orders, account_id, accounts) %>%
  dm::dm_add_fk(disps, account_id, accounts) %>%
  dm::dm_add_fk(trans, account_id, accounts) %>%
  dm::dm_set_colors(`#006400FF` = loans)

# Time the same dm_filter() call as for the plain local dm, now on the dm
# built from the converted tables while duckplyr methods are active.
dm_duckplyr |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   2.103   0.055   2.193

# Hand-written zoom / semi_join chain, timed on the dm built from the
# converted tables: filter `districts`, then reduce each remaining table
# against a table that was reduced in an earlier step.
dm_duckplyr |>
  dm_zoom_to(districts) |>
  filter(id == 1L) |>
  dm_update_zoomed() |>
  dm_zoom_to(accounts) |>
  semi_join(districts) |>
  dm_update_zoomed() |>
  dm_zoom_to(loans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(orders) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(trans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(disps) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(cards) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  dm_zoom_to(clients) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  system.time()
#>    user  system elapsed 
#>   0.262   0.000   0.263

# Same hand-written chain, but with compute() inserted after every filter or
# semi_join step, before the result is written back.
# Presumably this forces each intermediate result to be materialized
# -- TODO confirm.
dm_duckplyr |>
  dm_zoom_to(districts) |>
  filter(id == 1L) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(accounts) |>
  semi_join(districts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(loans) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(orders) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(trans) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(disps) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(cards) |>
  semi_join(disps) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(clients) |>
  semi_join(disps) |>
  compute() |>
  dm_update_zoomed() |>
  system.time()
#>    user  system elapsed 
#>   0.276   0.008   0.284

# Repeat the conversion with the tables of the prototype dm. This overwrites
# the table variables of the same names created from dm_local.
trans <- pull_tbl(dm_local_ptype, "trans", keyed = TRUE) |> as_duck_tbl()
districts <- pull_tbl(dm_local_ptype, "districts", keyed = TRUE) |> as_duck_tbl()
clients <- pull_tbl(dm_local_ptype, "clients", keyed = TRUE) |> as_duck_tbl()
orders <- pull_tbl(dm_local_ptype, "orders", keyed = TRUE) |> as_duck_tbl()
cards <- pull_tbl(dm_local_ptype, "cards", keyed = TRUE) |> as_duck_tbl()
disps <- pull_tbl(dm_local_ptype, "disps", keyed = TRUE) |> as_duck_tbl()
tkeys <- pull_tbl(dm_local_ptype, "tkeys", keyed = TRUE) |> as_duck_tbl()
accounts <- pull_tbl(dm_local_ptype, "accounts", keyed = TRUE) |> as_duck_tbl()
loans <- pull_tbl(dm_local_ptype, "loans", keyed = TRUE) |> as_duck_tbl()

# Build a dm from the converted prototype tables, with the same primary and
# foreign key declarations as for dm_duckplyr.
dm_duckplyr_ptype <-
  dm::dm(
    trans,
    districts,
    clients,
    orders,
    cards,
    disps,
    tkeys,
    accounts,
    loans,
  ) %>%
  dm::dm_add_pk(trans, id) %>%
  dm::dm_add_pk(districts, id) %>%
  dm::dm_add_pk(clients, id) %>%
  dm::dm_add_pk(orders, id) %>%
  dm::dm_add_pk(cards, id) %>%
  dm::dm_add_pk(disps, id) %>%
  dm::dm_add_pk(accounts, id) %>%
  dm::dm_add_pk(loans, id) %>%
  dm::dm_add_fk(accounts, district_id, districts) %>%
  dm::dm_add_fk(disps, client_id, clients) %>%
  dm::dm_add_fk(cards, disp_id, disps) %>%
  dm::dm_add_fk(loans, account_id, accounts) %>%
  dm::dm_add_fk(orders, account_id, accounts) %>%
  dm::dm_add_fk(disps, account_id, accounts) %>%
  dm::dm_add_fk(trans, account_id, accounts) %>%
  dm::dm_set_colors(`#006400FF` = loans)

# Time dm_filter() on the prototype-based dm; this is the case the issue
# text refers to as slow "even with empty duck tibbles".
dm_duckplyr_ptype |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   1.806   0.042   1.867

Created on 2025-01-04 with reprex v2.1.1

@krlmlr
Copy link
Member Author

krlmlr commented Jan 4, 2025

Much simpler: It's not the duck tibbles inside the dm, it's dm using duckplyr.

# Simplified reproduction: the dm holds tables that are never converted with
# as_duck_tbl(); only the dplyr methods in effect are switched later on.
library(dm)

# Presumably switches off duckplyr's "meta" functionality, which the issue
# text rules out as the cause -- TODO confirm.
Sys.setenv(DUCKPLYR_META_SKIP = TRUE)

# Cache the prototype of the financial dm on disk if no cache file exists.
# NOTE(review): "dm_local.qs" is the same file name the first example uses
# for the collected (non-prototype) dm. An existing file is reused as-is, so
# confirm which contents were actually timed.
if (!file.exists("dm_local.qs")) {
  dm <- dm_financial()

  dm_local <- dm |>
    dm_ptype()

  qs::qsave(dm_local, file = "dm_local.qs")
}

# Read the cached dm and drop five of its tables to shrink the example.
dm_local <-
  qs::qread("dm_local.qs") %>%
  dm_select_tbl(-cards, -clients, -disps, -loans, -orders)

# Case 1: restore the dplyr methods (see the message below) and time
# dm_filter().
duckplyr::methods_restore()
#> ℹ Restoring dplyr methods.

dm_local |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   0.070   0.001   0.094

# Case 2: overwrite the dplyr methods with duckplyr methods and time the
# identical call on the identical dm.
duckplyr::methods_overwrite()
#> ✔ Overwriting dplyr methods with duckplyr methods.
#> ℹ Turn off with `duckplyr::methods_restore()`.

dm_local |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   1.172   0.034   1.267

Created on 2025-01-04 with reprex v2.1.1

@krlmlr
Copy link
Member Author

krlmlr commented Jan 4, 2025

Half of that is fallback collection; another quarter, perhaps, is cli::cli_abort() in check_df_for_rel().

@krlmlr
Copy link
Member Author

krlmlr commented Jan 5, 2025

But also the version where dm uses dplyr and the tables are duck tibbles is slow in itself:

# Third reproduction: only dm is attached. duckplyr is never attached with
# library() and is reached via `duckplyr::` further down, to convert tables.
library(dm)
#> 
#> Attaching package: 'dm'
#> The following object is masked from 'package:stats':
#> 
#>     filter

# Cache the prototype of the financial dm on disk if no cache file exists.
# NOTE(review): "dm_local.qs" is the same file name the first example uses
# for the collected (non-prototype) dm. An existing file is reused as-is, so
# confirm which contents were actually timed.
if (!file.exists("dm_local.qs")) {
  dm <- dm_financial()

  dm_local <- dm |>
    dm_ptype()

  qs::qsave(dm_local, file = "dm_local.qs")
}

dm_local <- qs::qread("dm_local.qs")

# Extract every table from dm_local and convert it with
# duckplyr::as_duck_tbl(). The first call prints duckplyr's fallback notice,
# reproduced below.
trans <- pull_tbl(dm_local, "trans", keyed = TRUE) |> duckplyr::as_duck_tbl()
#> The duckplyr package is configured to fall back to dplyr when it encounters an
#> incompatibility. Fallback events can be collected and uploaded for analysis to
#> guide future development. By default, data will be collected but no data will
#> be uploaded.
#> ℹ Automatic fallback uploading is not controlled and therefore disabled, see
#>   `?duckplyr::fallback()`.
#> ✔ Number of reports ready for upload: 193.
#> → Review with `duckplyr::fallback_review()`, upload with
#>   `duckplyr::fallback_upload()`.
#> ℹ Configure automatic uploading with `duckplyr::fallback_config()`.
districts <- pull_tbl(dm_local, "districts", keyed = TRUE) |> duckplyr::as_duck_tbl()
clients <- pull_tbl(dm_local, "clients", keyed = TRUE) |> duckplyr::as_duck_tbl()
orders <- pull_tbl(dm_local, "orders", keyed = TRUE) |> duckplyr::as_duck_tbl()
cards <- pull_tbl(dm_local, "cards", keyed = TRUE) |> duckplyr::as_duck_tbl()
disps <- pull_tbl(dm_local, "disps", keyed = TRUE) |> duckplyr::as_duck_tbl()
tkeys <- pull_tbl(dm_local, "tkeys", keyed = TRUE) |> duckplyr::as_duck_tbl()
accounts <- pull_tbl(dm_local, "accounts", keyed = TRUE) |> duckplyr::as_duck_tbl()
loans <- pull_tbl(dm_local, "loans", keyed = TRUE) |> duckplyr::as_duck_tbl()

# Reassemble a dm from the converted tables and declare its keys explicitly:
# an `id` primary key on every table except `tkeys`, followed by the foreign
# keys between the tables.
dm_duckplyr <-
  dm::dm(
    trans,
    districts,
    clients,
    orders,
    cards,
    disps,
    tkeys,
    accounts,
    loans,
  ) %>%
  dm::dm_add_pk(trans, id) %>%
  dm::dm_add_pk(districts, id) %>%
  dm::dm_add_pk(clients, id) %>%
  dm::dm_add_pk(orders, id) %>%
  dm::dm_add_pk(cards, id) %>%
  dm::dm_add_pk(disps, id) %>%
  dm::dm_add_pk(accounts, id) %>%
  dm::dm_add_pk(loans, id) %>%
  dm::dm_add_fk(accounts, district_id, districts) %>%
  dm::dm_add_fk(disps, client_id, clients) %>%
  dm::dm_add_fk(cards, disp_id, disps) %>%
  dm::dm_add_fk(loans, account_id, accounts) %>%
  dm::dm_add_fk(orders, account_id, accounts) %>%
  dm::dm_add_fk(disps, account_id, accounts) %>%
  dm::dm_add_fk(trans, account_id, accounts) %>%
  dm::dm_set_colors(`#006400FF` = loans)

# Time dm_filter() on the dm of converted tables. In this script duckplyr has
# not been attached with library().
dm_duckplyr |>
  dm_filter(districts = (id == 1L)) |>
  system.time()
#>    user  system elapsed 
#>   0.476   0.034   0.603

# Hand-written zoom / semi_join chain, timed on the same dm for comparison
# with the dm_filter() call: filter `districts`, then reduce each remaining
# table against a table that was reduced in an earlier step.
dm_duckplyr |>
  dm_zoom_to(districts) |>
  filter(id == 1L) |>
  dm_update_zoomed() |>
  dm_zoom_to(accounts) |>
  semi_join(districts) |>
  dm_update_zoomed() |>
  dm_zoom_to(loans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(orders) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(trans) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(disps) |>
  semi_join(accounts) |>
  dm_update_zoomed() |>
  dm_zoom_to(cards) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  dm_zoom_to(clients) |>
  semi_join(disps) |>
  dm_update_zoomed() |>
  system.time()
#>    user  system elapsed 
#>   0.053   0.000   0.053

# Same hand-written chain, but with compute() inserted after every filter or
# semi_join step, before the result is written back.
# Presumably this forces each intermediate result to be materialized
# -- TODO confirm.
dm_duckplyr |>
  dm_zoom_to(districts) |>
  filter(id == 1L) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(accounts) |>
  semi_join(districts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(loans) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(orders) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(trans) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(disps) |>
  semi_join(accounts) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(cards) |>
  semi_join(disps) |>
  compute() |>
  dm_update_zoomed() |>
  dm_zoom_to(clients) |>
  semi_join(disps) |>
  compute() |>
  dm_update_zoomed() |>
  system.time()
#>    user  system elapsed 
#>   0.064   0.002   0.057

Created on 2025-01-05 with reprex v2.1.1

@krlmlr
Copy link
Member Author

krlmlr commented Jan 5, 2025

I'm getting wildly different timings for the second example, though. Probably okay?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant