From 04d2822559936609233d5d1a07b4a86c0875f8d2 Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Tue, 30 Sep 2025 20:08:03 +0200 Subject: [PATCH 01/10] optionally use `DISTINCT ON` SQL clause when `keep_all = TRUE` in `distinct()` --- NAMESPACE | 4 +++ R/backend-postgres.R | 10 ++++++ R/db-sql.R | 10 ++++++ R/lazy-select-query.R | 32 +++++++++++++++-- R/query-select.R | 12 +++++-- R/sql-clause.R | 11 +++++- R/verb-distinct.R | 82 ++++++++++++++++++++++++++++--------------- R/verb-select.R | 2 +- man/db-sql.Rd | 3 ++ 9 files changed, 130 insertions(+), 36 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b2e724c54..215993945 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -407,6 +407,9 @@ S3method(sql_values_subquery,Redshift) S3method(sql_values_subquery,RedshiftConnection) S3method(src_tbls,src_sql) S3method(summarise,tbl_lazy) +S3method(supports_distinct_on,DBIConnection) +S3method(supports_distinct_on,PostgreSQL) +S3method(supports_distinct_on,PqConnection) S3method(supports_window_clause,"Spark SQL") S3method(supports_window_clause,ACCESS) S3method(supports_window_clause,DBIConnection) @@ -586,6 +589,7 @@ export(src_dbi) export(src_memdb) export(src_sql) export(src_test) +export(supports_distinct_on) export(supports_window_clause) export(table_path_components) export(table_path_name) diff --git a/R/backend-postgres.R b/R/backend-postgres.R index 824f886bb..cb2a26da1 100644 --- a/R/backend-postgres.R +++ b/R/backend-postgres.R @@ -444,6 +444,16 @@ supports_window_clause.PostgreSQL <- function(con) { TRUE } +#' @export +supports_distinct_on.PqConnection <- function(con) { + TRUE +} + +#' @export +supports_distinct_on.PostgreSQL <- function(con) { + TRUE +} + #' @export db_supports_table_alias_with_as.PqConnection <- function(con) { TRUE diff --git a/R/db-sql.R b/R/db-sql.R index f29aa1021..39214d65d 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -327,6 +327,16 @@ db_supports_table_alias_with_as.TestConnection <- function(con) { TRUE } +#' @rdname db-sql +#' @export +supports_distinct_on <- function(con) { + UseMethod("supports_distinct_on") +} + +#' @export +supports_distinct_on.DBIConnection <- function(con) { + FALSE +} # Query generation -------------------------------------------------------- diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index 439697349..d3eacda9b 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -19,7 +19,10 @@ lazy_select_query <- function(x, # stopifnot(is.character(group_by)) stopifnot(is_lazy_sql_part(order_by)) check_number_whole(limit, allow_infinite = TRUE, allow_null = TRUE) - check_bool(distinct) + # distinct = FALSE -> no distinct + # distinct = TRUE -> distinct over all columns + # distinct = columns -> DISTINCT ON (...) + stopifnot(is.logical(distinct) || is_lazy_sql_part(distinct)) select <- select %||% syms(set_names(op_vars(x))) select_operation <- arg_match0(select_operation, c("select", "mutate", "summarise")) @@ -39,7 +42,11 @@ lazy_select_query <- function(x, frame = frame ) } else { + select <- new_lazy_select(select) + if (!is.logical(distinct)) { + distinct <- new_lazy_select(distinct) + } } lazy_query( @@ -166,8 +173,26 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { alias <- remote_name(op$x, null_if_local = FALSE) %||% unique_subquery_name() from <- sql_build(op$x, con, sql_options = sql_options) + + selects <- op$select + + if (!isTRUE(op$distinct) && !isFALSE(op$distinct)) { + distinct <- get_select_sql( + select = op$distinct, + select_operation = op$select_operation, + in_vars = op_vars(op$x), + table_alias = alias, + con = con, + use_star = sql_options$use_star + )$select_sql + + selects <- anti_join(selects, op$distinct, by = "expr") + } else { + distinct <- op$distinct + } + select_sql_list <- get_select_sql( - select = op$select, + select = selects, select_operation = op$select_operation, in_vars = op_vars(op$x), table_alias = alias, @@ -176,6 +201,7 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { ) where_sql <- translate_sql_(op$where, con = con, context = list(clause = "WHERE")) + select_query( from = from, select = select_sql_list$select_sql, @@ -184,7 +210,7 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { having = translate_sql_(op$having, con = con, window = FALSE), window = select_sql_list$window_sql, order_by = translate_sql_(op$order_by, con = con), - distinct = op$distinct, + distinct = distinct, limit = op$limit, from_alias = alias ) diff --git a/R/query-select.R b/R/query-select.R index 39c54ac70..aad573308 100644 --- a/R/query-select.R +++ b/R/query-select.R @@ -17,7 +17,7 @@ select_query <- function(from, check_character(window) check_character(order_by) check_number_whole(limit, allow_infinite = TRUE, allow_null = TRUE) - check_bool(distinct) + stopifnot(is.logical(distinct) || is.character(distinct)) check_string(from_alias, allow_null = TRUE) structure( @@ -39,10 +39,12 @@ select_query <- function(from, #' @export print.select_query <- function(x, ...) { - cat_line("") + cat_line("") cat_line("From:") cat_line(indent_print(x$from)) + if (!is.logical(x$distinct)) cat_line("Dist On: ", named_commas(x$distinct)) if (length(x$select)) cat_line("Select: ", named_commas(x$select)) if (length(x$where)) cat_line("Where: ", named_commas(x$where)) if (length(x$group_by)) cat_line("Group by: ", named_commas(x$group_by)) @@ -93,7 +95,11 @@ select_query_clauses <- function(x, subquery = FALSE) { group_by = length(x$group_by) > 0, having = length(x$having) > 0, select = !identical(unname(x$select), sql("*")), - distinct = x$distinct, + distinct = if (is.logical(x$distinct)) { + x$distinct + } else { + !identical(unname(x$distinct), sql("*")) + }, window = length(x$window) > 0, order_by = (!subquery || !is.null(x$limit)) && length(x$order_by) > 0, limit = !is.null(x$limit) diff --git a/R/sql-clause.R b/R/sql-clause.R index 8a10e92c4..edbc07b5b 100644 --- a/R/sql-clause.R +++ b/R/sql-clause.R @@ -49,10 +49,19 @@ sql_clause_select <- function(con, top <- as.integer(top) } + sql_distinct <- + if(isTRUE(distinct)) { + " DISTINCT" + } else if (isFALSE(distinct)) { + "" + } else { + glue_sql2(con, " DISTINCT ON ({.col distinct})") + } + clause <- glue_sql2( con, "SELECT", - if (distinct) " DISTINCT", + sql_distinct, if (!is.null(top)) " TOP {.val top}" ) diff --git a/R/verb-distinct.R b/R/verb-distinct.R index e1d2d9dc2..96857907c 100644 --- a/R/verb-distinct.R +++ b/R/verb-distinct.R @@ -17,37 +17,50 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { grps <- syms(op_grps(.data)) empty_dots <- dots_n(...) == 0 - can_use_distinct <- !.keep_all || (empty_dots && is_empty(grps)) - if (!can_use_distinct) { - needs_dummy_order <- is.null(op_sort(.data)) - - if (needs_dummy_order) { - dummy_order_vars <- colnames(.data)[[1]] - .data <- .data %>% window_order(!!sym(dummy_order_vars)) + use_distinct_on <- .keep_all && !(empty_dots && is_empty(grps)) + + if (!use_distinct_on) { + if (empty_dots) { + dots <- quos(!!!syms(colnames(.data))) + } else { + dots <- partial_eval_dots(.data, ..., .named = FALSE) + dots <- quos(!!!dots) } + prep <- distinct_prepare_compat(.data, dots, group_vars = group_vars(.data)) + out <- dplyr::select(prep$data, prep$keep) - .data <- .data %>% - group_by(..., .add = TRUE) %>% - filter(row_number() == 1L) %>% - group_by(!!!grps) + out$lazy_query <- add_distinct(out, distinct = TRUE) + } else { + if (supports_distinct_on(.data$src$con)) { + loc <- tidyselect::eval_select(expr(c(...)), .data) + loc <- ensure_group_vars(loc, .data, notify = TRUE) + new_vars <- set_names(colnames(.data)[loc], names(loc)) + + prep <- distinct_prepare_compat(.data, vars = quos(), + group_vars = group_vars(.data)) + out <- dplyr::select(prep$data, prep$keep) + + out$lazy_query <- add_distinct(out, distinct = new_vars) + } else { + needs_dummy_order <- is.null(op_sort(.data)) + + if (needs_dummy_order) { + dummy_order_vars <- colnames(.data)[[1]] + .data <- .data %>% window_order(!!sym(dummy_order_vars)) + } - if (needs_dummy_order) { - .data <- .data %>% window_order() - } + .data <- .data %>% + group_by(..., .add = TRUE) %>% + filter(row_number() == 1L) %>% + group_by(!!!grps) - return(.data) - } + if (needs_dummy_order) { + .data <- .data %>% window_order() + } - if (empty_dots) { - dots <- quos(!!!syms(colnames(.data))) - } else { - dots <- partial_eval_dots(.data, ..., .named = FALSE) - dots <- quos(!!!dots) + return(.data) + } } - prep <- distinct_prepare_compat(.data, dots, group_vars = group_vars(.data)) - out <- dplyr::select(prep$data, prep$keep) - - out$lazy_query <- add_distinct(out) out } @@ -146,12 +159,25 @@ quo_is_variable_reference <- function(quo) { } -add_distinct <- function(.data) { +add_distinct <- function(.data, distinct) { lazy_query <- .data$lazy_query + distinct <- + if (is.logical(distinct)) { + distinct + } else { + # if the DISTINCT ON clause contains all columns + # we can fall back to a normal DISTINCT + if (is_identity(syms(distinct), names(distinct), colnames(.data))) { + TRUE + } else { + syms(distinct) + } + } + out <- lazy_select_query( x = lazy_query, - distinct = TRUE + distinct = distinct ) # TODO this could also work for joins if (!is_lazy_select_query(lazy_query)) { @@ -171,6 +197,6 @@ add_distinct <- function(.data) { return(out) } - lazy_query$distinct <- TRUE + lazy_query$distinct <- distinct lazy_query } diff --git a/R/verb-select.R b/R/verb-select.R index ef93594f3..08a9a3a02 100644 --- a/R/verb-select.R +++ b/R/verb-select.R @@ -140,7 +140,7 @@ select_can_be_inlined <- function(lazy_query, vars) { ordered_present <- all(intersect(computed_columns, order_vars) %in% vars) # if the projection is distinct, it must be bijective - is_distinct <- is_true(lazy_query$distinct) + is_distinct <- !is.logical(lazy_query$distinct) || is_true(lazy_query$distinct) is_bijective_projection <- identical(sort(unname(vars)), op_vars(lazy_query)) distinct_is_bijective <- !is_distinct || is_bijective_projection diff --git a/man/db-sql.Rd b/man/db-sql.Rd index c1aec3dea..4b1588daa 100644 --- a/man/db-sql.Rd +++ b/man/db-sql.Rd @@ -15,6 +15,7 @@ \alias{sql_query_rows} \alias{supports_window_clause} \alias{db_supports_table_alias_with_as} +\alias{supports_distinct_on} \alias{sql_query_select} \alias{sql_query_join} \alias{sql_query_multi_join} @@ -58,6 +59,8 @@ supports_window_clause(con) db_supports_table_alias_with_as(con) +supports_distinct_on(con) + sql_query_select( con, select, From 09f6aa12443fe2a1b1059d3eac323c0743a0a43f Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Tue, 30 Sep 2025 20:20:56 +0200 Subject: [PATCH 02/10] simplify and fix whitespace --- R/lazy-select-query.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index d3eacda9b..0e557526b 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -42,7 +42,6 @@ lazy_select_query <- function(x, frame = frame ) } else { - select <- new_lazy_select(select) if (!is.logical(distinct)) { distinct <- new_lazy_select(distinct) @@ -176,7 +175,7 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { selects <- op$select - if (!isTRUE(op$distinct) && !isFALSE(op$distinct)) { + if (!is.logical(op$distinct)) { distinct <- get_select_sql( select = op$distinct, select_operation = op$select_operation, @@ -201,7 +200,6 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { ) where_sql <- translate_sql_(op$where, con = con, context = list(clause = "WHERE")) - select_query( from = from, select = select_sql_list$select_sql, From 4fdaa082658db89233a7bacfd8b49ad1d3a9c1ca Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Wed, 1 Oct 2025 09:44:12 +0200 Subject: [PATCH 03/10] some temporary fixes --- R/lazy-select-query.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index 0e557526b..510c049a7 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -143,7 +143,8 @@ is_select_identity <- function(select, vars_prev) { #' @export print.lazy_select_query <- function(x, ...) { - cat_line("") + cat_line("") cat_line("From:") cat_line(indent_print(sql_build(x$x, simulate_dbi()))) @@ -185,7 +186,8 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { use_star = sql_options$use_star )$select_sql - selects <- anti_join(selects, op$distinct, by = "expr") + # if some renaming is going on we want to keep that deliberately + selects <- anti_join(selects, op$distinct, by = "name") } else { distinct <- op$distinct } From e7db82bcdda74ad6355e84d026018779182ac891 Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Wed, 1 Oct 2025 12:07:48 +0200 Subject: [PATCH 04/10] we explicitely need to select columns after using ON() --- R/lazy-select-query.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index 510c049a7..be2b2efbf 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -185,9 +185,6 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { con = con, use_star = sql_options$use_star )$select_sql - - # if some renaming is going on we want to keep that deliberately - selects <- anti_join(selects, op$distinct, by = "name") } else { distinct <- op$distinct } From 7713bca5da0e5b60c56a5959bcfba75626e446bf Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Wed, 1 Oct 2025 12:08:13 +0200 Subject: [PATCH 05/10] make distinct computations and renaming work as usual --- R/verb-distinct.R | 57 +++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/R/verb-distinct.R b/R/verb-distinct.R index 96857907c..12e578637 100644 --- a/R/verb-distinct.R +++ b/R/verb-distinct.R @@ -17,9 +17,10 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { grps <- syms(op_grps(.data)) empty_dots <- dots_n(...) == 0 - use_distinct_on <- .keep_all && !(empty_dots && is_empty(grps)) + can_use_distinct <- + !.keep_all || (empty_dots && is_empty(grps)) || supports_distinct_on(.data$src$con) - if (!use_distinct_on) { + if (can_use_distinct) { if (empty_dots) { dots <- quos(!!!syms(colnames(.data))) } else { @@ -27,39 +28,32 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { dots <- quos(!!!dots) } prep <- distinct_prepare_compat(.data, dots, group_vars = group_vars(.data)) - out <- dplyr::select(prep$data, prep$keep) - out$lazy_query <- add_distinct(out, distinct = TRUE) - } else { - if (supports_distinct_on(.data$src$con)) { - loc <- tidyselect::eval_select(expr(c(...)), .data) - loc <- ensure_group_vars(loc, .data, notify = TRUE) - new_vars <- set_names(colnames(.data)[loc], names(loc)) - - prep <- distinct_prepare_compat(.data, vars = quos(), - group_vars = group_vars(.data)) + if (!.keep_all) { out <- dplyr::select(prep$data, prep$keep) - - out$lazy_query <- add_distinct(out, distinct = new_vars) + out$lazy_query <- add_distinct(out, distinct = TRUE) } else { - needs_dummy_order <- is.null(op_sort(.data)) - - if (needs_dummy_order) { - dummy_order_vars <- colnames(.data)[[1]] - .data <- .data %>% window_order(!!sym(dummy_order_vars)) - } + out <- prep$data + out$lazy_query <- add_distinct(out, distinct = set_names(names(quos_auto_name(dots)))) + } + } else { + needs_dummy_order <- is.null(op_sort(.data)) - .data <- .data %>% - group_by(..., .add = TRUE) %>% - filter(row_number() == 1L) %>% - group_by(!!!grps) + if (needs_dummy_order) { + dummy_order_vars <- colnames(.data)[[1]] + .data <- .data %>% window_order(!!sym(dummy_order_vars)) + } - if (needs_dummy_order) { - .data <- .data %>% window_order() - } + .data <- .data %>% + group_by(..., .add = TRUE) %>% + filter(row_number() == 1L) %>% + group_by(!!!grps) - return(.data) + if (needs_dummy_order) { + .data <- .data %>% window_order() } + + return(.data) } out } @@ -197,6 +191,11 @@ add_distinct <- function(.data, distinct) { return(out) } - lazy_query$distinct <- distinct + if (!is.logical(distinct)) { + lazy_query$distinct <- new_lazy_select(distinct) + } else { + lazy_query$distinct <- distinct + } + lazy_query } From 9b75eb038ecee4da1cdd380835168fda62952e5a Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Wed, 1 Oct 2025 12:08:22 +0200 Subject: [PATCH 06/10] add some postgres tests --- tests/testthat/test-backend-postgres.R | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/testthat/test-backend-postgres.R b/tests/testthat/test-backend-postgres.R index c60512f46..51e70e053 100644 --- a/tests/testthat/test-backend-postgres.R +++ b/tests/testthat/test-backend-postgres.R @@ -436,3 +436,32 @@ test_that("correctly escapes dates", { dd <- as.Date("2022-03-04") expect_equal(escape(dd, con = con), sql("'2022-03-04'::date")) }) + +test_that("distinct can compute variables when .keep_all is TRUE", { + con <- src_test("postgres") + + out <- memdb_frame(x = c(2, 1), y = c(1, 2), con = con) %>% + distinct(z = x + y, .keep_all = TRUE) %>% + collect() + + expect_named(out, c("x", "y", "z")) + expect_equal(out$z, 3) +}) + +test_that("distinct respects window_order when .keep_all is TRUE", { + con <- src_test("postgres") + + mf <- memdb_frame(x = c(1, 1, 2, 2), y = 1:4, con = con) + out <- mf %>% + window_order(desc(y)) %>% + distinct(x, .keep_all = TRUE) + + expect_equal(out %>% collect(), tibble(x = 1:2, y = c(2, 4))) + + lf <- lazy_frame(x = c(1, 1, 2, 2), y = 1:4, con = con) + expect_snapshot( + lf %>% + window_order(desc(y)) %>% + distinct(x, .keep_all = TRUE) + ) +}) From b2b7146ba6acc6e4b3f334d5880b6c34ed0a2638 Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Wed, 1 Oct 2025 20:19:57 +0200 Subject: [PATCH 07/10] fix new tests --- tests/testthat/test-backend-postgres.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-backend-postgres.R b/tests/testthat/test-backend-postgres.R index 51e70e053..b4ed958ae 100644 --- a/tests/testthat/test-backend-postgres.R +++ b/tests/testthat/test-backend-postgres.R @@ -437,10 +437,12 @@ test_that("correctly escapes dates", { expect_equal(escape(dd, con = con), sql("'2022-03-04'::date")) }) +# we test that again because postgres uses the DISTINCT ON feature test_that("distinct can compute variables when .keep_all is TRUE", { con <- src_test("postgres") - out <- memdb_frame(x = c(2, 1), y = c(1, 2), con = con) %>% + out <- + local_db_table(con, data.frame(x = c(2, 1), y = c(1, 2)), "df_x") %>% distinct(z = x + y, .keep_all = TRUE) %>% collect() @@ -451,16 +453,15 @@ test_that("distinct can compute variables when .keep_all is TRUE", { test_that("distinct respects window_order when .keep_all is TRUE", { con <- src_test("postgres") - mf <- memdb_frame(x = c(1, 1, 2, 2), y = 1:4, con = con) + mf <- local_db_table(con, data.frame(x = c(1, 1, 2, 2), y = 1:4), "mf") out <- mf %>% window_order(desc(y)) %>% distinct(x, .keep_all = TRUE) expect_equal(out %>% collect(), tibble(x = 1:2, y = c(2, 4))) - lf <- lazy_frame(x = c(1, 1, 2, 2), y = 1:4, con = con) expect_snapshot( - lf %>% + mf %>% window_order(desc(y)) %>% distinct(x, .keep_all = TRUE) ) From d6c04a1135451978a13d01db9efc8c6138d40cf0 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Thu, 27 Nov 2025 08:45:31 +0200 Subject: [PATCH 08/10] Reformat with air --- R/lazy-select-query.R | 8 ++++++-- R/sql-clause.R | 2 +- R/verb-distinct.R | 9 +++++++-- R/verb-select.R | 3 ++- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index a4c120105..1a8ac3f99 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -158,8 +158,12 @@ is_select_identity <- function(select, vars_prev) { #' @export print.lazy_select_query <- function(x, ...) { - cat_line("") + cat_line( + "" + ) cat_line("From:") cat_line(indent_print(sql_build(x$x, simulate_dbi()))) diff --git a/R/sql-clause.R b/R/sql-clause.R index a4b96473d..4f2d1539e 100644 --- a/R/sql-clause.R +++ b/R/sql-clause.R @@ -54,7 +54,7 @@ sql_clause_select <- function( } sql_distinct <- - if(isTRUE(distinct)) { + if (isTRUE(distinct)) { " DISTINCT" } else if (isFALSE(distinct)) { "" diff --git a/R/verb-distinct.R b/R/verb-distinct.R index 5c10fcfd8..34e715c1e 100644 --- a/R/verb-distinct.R +++ b/R/verb-distinct.R @@ -18,7 +18,9 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { grps <- syms(op_grps(.data)) empty_dots <- dots_n(...) == 0 can_use_distinct <- - !.keep_all || (empty_dots && is_empty(grps)) || supports_distinct_on(.data$src$con) + !.keep_all || + (empty_dots && is_empty(grps)) || + supports_distinct_on(.data$src$con) if (can_use_distinct) { if (empty_dots) { @@ -34,7 +36,10 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { out$lazy_query <- add_distinct(out, distinct = TRUE) } else { out <- prep$data - out$lazy_query <- add_distinct(out, distinct = set_names(names(quos_auto_name(dots)))) + out$lazy_query <- add_distinct( + out, + distinct = set_names(names(quos_auto_name(dots))) + ) } } else { needs_dummy_order <- is.null(op_sort(.data)) diff --git a/R/verb-select.R b/R/verb-select.R index 08a9a3a02..39cc208b1 100644 --- a/R/verb-select.R +++ b/R/verb-select.R @@ -140,7 +140,8 @@ select_can_be_inlined <- function(lazy_query, vars) { ordered_present <- all(intersect(computed_columns, order_vars) %in% vars) # if the projection is distinct, it must be bijective - is_distinct <- !is.logical(lazy_query$distinct) || is_true(lazy_query$distinct) + is_distinct <- !is.logical(lazy_query$distinct) || + is_true(lazy_query$distinct) is_bijective_projection <- identical(sort(unname(vars)), op_vars(lazy_query)) distinct_is_bijective <- !is_distinct || is_bijective_projection From 2f0d8da2ee83b9c65addaab96f7f60e283b0cb1f Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Mon, 1 Dec 2025 21:43:49 +0100 Subject: [PATCH 09/10] simplifications (review response) --- R/lazy-select-query.R | 15 +++++---------- R/query-select.R | 2 +- R/sql-clause.R | 21 +++++++++++---------- R/verb-distinct.R | 27 ++++++--------------------- 4 files changed, 23 insertions(+), 42 deletions(-) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index 1a8ac3f99..fa9712e6c 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -24,7 +24,7 @@ lazy_select_query <- function( # distinct = FALSE -> no distinct # distinct = TRUE -> distinct over all columns # distinct = columns -> DISTINCT ON (...) - stopifnot(is.logical(distinct) || is_lazy_sql_part(distinct)) + stopifnot(is_bool(distinct) || is_lazy_sql_part(distinct)) select <- select %||% syms(set_names(op_vars(x))) select_operation <- arg_match0( @@ -48,9 +48,6 @@ lazy_select_query <- function( ) } else { select <- new_lazy_select(select) - if (!is.logical(distinct)) { - distinct <- new_lazy_select(distinct) - } } lazy_query( @@ -161,7 +158,7 @@ print.lazy_select_query <- function(x, ...) { cat_line( "" ) cat_line("From:") @@ -207,11 +204,9 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { alias <- remote_name(op$x, null_if_local = FALSE) %||% unique_subquery_name() from <- sql_build(op$x, con, sql_options = sql_options) - selects <- op$select - - if (!is.logical(op$distinct)) { + if (is_lazy_sql_part(op$distinct)) { distinct <- get_select_sql( - select = op$distinct, + select = new_lazy_select(op$distinct), select_operation = op$select_operation, in_vars = op_vars(op$x), table_alias = alias, @@ -223,7 +218,7 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) { } select_sql_list <- get_select_sql( - select = selects, + select = op$select, select_operation = op$select_operation, in_vars = op_vars(op$x), table_alias = alias, diff --git a/R/query-select.R b/R/query-select.R index 570b005df..70f889eef 100644 --- a/R/query-select.R +++ b/R/query-select.R @@ -19,7 +19,7 @@ select_query <- function( check_character(window) check_character(order_by) check_number_whole(limit, allow_infinite = TRUE, allow_null = TRUE) - stopifnot(is.logical(distinct) || is.character(distinct)) + stopifnot(is_bool(distinct) || is.character(distinct)) check_string(from_alias, allow_null = TRUE) structure( diff --git a/R/sql-clause.R b/R/sql-clause.R index 4f2d1539e..f30a73a3d 100644 --- a/R/sql-clause.R +++ b/R/sql-clause.R @@ -53,25 +53,26 @@ sql_clause_select <- function( top <- as.integer(top) } - sql_distinct <- - if (isTRUE(distinct)) { - " DISTINCT" - } else if (isFALSE(distinct)) { - "" - } else { - glue_sql2(con, " DISTINCT ON ({.col distinct})") - } - clause <- glue_sql2( con, "SELECT", - sql_distinct, + sql_distinct(con, distinct), if (!is.null(top)) " TOP {.val top}" ) sql_clause(clause, select) } +sql_distinct <- function(con, distinct) { + if (isTRUE(distinct)) { + " DISTINCT" + } else if (isFALSE(distinct)) { + "" + } else { + glue_sql2(con, " DISTINCT ON ({.col distinct})") + } +} + sql_clause_from <- function(from, lvl = 0) { sql_clause("FROM", from, lvl = lvl) } diff --git a/R/verb-distinct.R b/R/verb-distinct.R index 34e715c1e..9cbabac39 100644 --- a/R/verb-distinct.R +++ b/R/verb-distinct.R @@ -49,16 +49,14 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) { .data <- .data %>% window_order(!!sym(dummy_order_vars)) } - .data <- .data %>% + out <- .data %>% group_by(..., .add = TRUE) %>% filter(row_number() == 1L) %>% group_by(!!!grps) if (needs_dummy_order) { - .data <- .data %>% window_order() + out <- out %>% window_order() } - - return(.data) } out } @@ -163,18 +161,9 @@ quo_is_variable_reference <- function(quo) { add_distinct <- function(.data, distinct) { lazy_query <- .data$lazy_query - distinct <- - if (is.logical(distinct)) { - distinct - } else { - # if the DISTINCT ON clause contains all columns - # we can fall back to a normal DISTINCT - if (is_identity(syms(distinct), names(distinct), colnames(.data))) { - TRUE - } else { - syms(distinct) - } - } + if (!is_bool(distinct)) { + distinct <- syms(distinct) + } out <- lazy_select_query( x = lazy_query, @@ -198,11 +187,7 @@ add_distinct <- function(.data, distinct) { return(out) } - if (!is.logical(distinct)) { - lazy_query$distinct <- new_lazy_select(distinct) - } else { - lazy_query$distinct <- distinct - } + lazy_query$distinct <- distinct lazy_query } From 8b316962e95892358c2e895bb52ca0feb918b99a Mon Sep 17 00:00:00 2001 From: Lukas Schneiderbauer Date: Mon, 1 Dec 2025 21:50:48 +0100 Subject: [PATCH 10/10] print DISTINCT ON variables --- R/lazy-select-query.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/lazy-select-query.R b/R/lazy-select-query.R index fa9712e6c..ab6753124 100644 --- a/R/lazy-select-query.R +++ b/R/lazy-select-query.R @@ -168,6 +168,9 @@ print.lazy_select_query <- function(x, ...) { if (length(select)) { cat_line("Select: ", named_commas(select)) } + if (is_lazy_sql_part(x$distinct)) { + cat_line("Dist on: ", named_commas(x$distinct)) + } if (length(x$where)) { cat_line("Where: ", named_commas(x$where)) }