Skip to content
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ S3method(sql_values_subquery,Redshift)
S3method(sql_values_subquery,RedshiftConnection)
S3method(src_tbls,src_sql)
S3method(summarise,tbl_lazy)
S3method(supports_distinct_on,DBIConnection)
S3method(supports_distinct_on,PostgreSQL)
S3method(supports_distinct_on,PqConnection)
S3method(supports_window_clause,"Spark SQL")
S3method(supports_window_clause,ACCESS)
S3method(supports_window_clause,DBIConnection)
Expand Down Expand Up @@ -586,6 +589,7 @@ export(src_dbi)
export(src_memdb)
export(src_sql)
export(src_test)
export(supports_distinct_on)
export(supports_window_clause)
export(table_path_components)
export(table_path_name)
Expand Down
10 changes: 10 additions & 0 deletions R/backend-postgres.R
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,16 @@ supports_window_clause.PostgreSQL <- function(con) {
TRUE
}

#' @export
supports_distinct_on.PqConnection <- function(con) {
TRUE
}

#' @export
supports_distinct_on.PostgreSQL <- function(con) {
TRUE
}

#' @export
db_supports_table_alias_with_as.PqConnection <- function(con) {
TRUE
Expand Down
10 changes: 10 additions & 0 deletions R/db-sql.R
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,16 @@ db_supports_table_alias_with_as.TestConnection <- function(con) {
TRUE
}

#' @rdname db-sql
#' @export
supports_distinct_on <- function(con) {
UseMethod("supports_distinct_on")
}

#' @export
supports_distinct_on.DBIConnection <- function(con) {
FALSE
}

# Query generation --------------------------------------------------------

Expand Down
35 changes: 31 additions & 4 deletions R/lazy-select-query.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ lazy_select_query <- function(
# stopifnot(is.character(group_by))
stopifnot(is_lazy_sql_part(order_by))
check_number_whole(limit, allow_infinite = TRUE, allow_null = TRUE)
check_bool(distinct)
# distinct = FALSE -> no distinct
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a great comment that helps me understand the intent of the rest of the code 😄

But it does illustrates a problem with lazy_select_query() — it isn't properly documented and thus it's hard to tell what the types of the inputs are. You don't need to fix it here, but it might be a useful follow up PR.

# distinct = TRUE -> distinct over all columns
# distinct = columns -> DISTINCT ON (...)
stopifnot(is.logical(distinct) || is_lazy_sql_part(distinct))

select <- select %||% syms(set_names(op_vars(x)))
select_operation <- arg_match0(
Expand All @@ -45,6 +48,9 @@ lazy_select_query <- function(
)
} else {
select <- new_lazy_select(select)
if (!is.logical(distinct)) {
distinct <- new_lazy_select(distinct)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do this here and not in sql_build.lazy_select_query?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved it to sql_build.lazy_select_query() now. Originally I did this here because it seemed more consistent with the select variable.

}
}

lazy_query(
Expand Down Expand Up @@ -152,7 +158,12 @@ is_select_identity <- function(select, vars_prev) {

#' @export
print.lazy_select_query <- function(x, ...) {
cat_line("<SQL SELECT", if (x$distinct) " DISTINCT", ">")
cat_line(
"<SQL SELECT",
if (!isFALSE(x$distinct)) " DISTINCT",
if (!is.logical(x$distinct)) " ON",
">"
)
cat_line("From:")
cat_line(indent_print(sql_build(x$x, simulate_dbi())))

Expand Down Expand Up @@ -195,8 +206,24 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) {

alias <- remote_name(op$x, null_if_local = FALSE) %||% unique_subquery_name()
from <- sql_build(op$x, con, sql_options = sql_options)

selects <- op$select

if (!is.logical(op$distinct)) {
distinct <- get_select_sql(
select = op$distinct,
select_operation = op$select_operation,
in_vars = op_vars(op$x),
table_alias = alias,
con = con,
use_star = sql_options$use_star
)$select_sql
} else {
distinct <- op$distinct
}

select_sql_list <- get_select_sql(
select = op$select,
select = selects,
select_operation = op$select_operation,
in_vars = op_vars(op$x),
table_alias = alias,
Expand All @@ -217,7 +244,7 @@ sql_build.lazy_select_query <- function(op, con, ..., sql_options = NULL) {
having = translate_sql_(op$having, con = con, window = FALSE),
window = select_sql_list$window_sql,
order_by = translate_sql_(op$order_by, con = con),
distinct = op$distinct,
distinct = distinct,
limit = op$limit,
from_alias = alias
)
Expand Down
12 changes: 10 additions & 2 deletions R/query-select.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ select_query <- function(
check_character(window)
check_character(order_by)
check_number_whole(limit, allow_infinite = TRUE, allow_null = TRUE)
check_bool(distinct)
stopifnot(is.logical(distinct) || is.character(distinct))
check_string(from_alias, allow_null = TRUE)

structure(
Expand All @@ -41,10 +41,18 @@ select_query <- function(

#' @export
print.select_query <- function(x, ...) {
cat_line("<SQL SELECT", if (x$distinct) " DISTINCT", ">")
cat_line(
"<SQL SELECT",
if (!isFALSE(x$distinct)) " DISTINCT",
if (!is.logical(x$distinct)) " ON",
">"
)
cat_line("From:")
cat_line(indent_print(x$from))

if (!is.logical(x$distinct)) {
cat_line("Dist On: ", named_commas(x$distinct))
}
if (length(x$select)) {
cat_line("Select: ", named_commas(x$select))
}
Expand Down
11 changes: 10 additions & 1 deletion R/sql-clause.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,19 @@ sql_clause_select <- function(
top <- as.integer(top)
}

sql_distinct <-
if (isTRUE(distinct)) {
" DISTINCT"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our usual style would be to put sql_distinct <- in each branch or extract out a helper function. I'd probably lean towards a helper function since you could inline that in clause below.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like I have done now? Or should the helper function be inside sql_clause_select()?

} else if (isFALSE(distinct)) {
""
} else {
glue_sql2(con, " DISTINCT ON ({.col distinct})")
}

clause <- glue_sql2(
con,
"SELECT",
if (distinct) " DISTINCT",
sql_distinct,
if (!is.null(top)) " TOP {.val top}"
)

Expand Down
62 changes: 46 additions & 16 deletions R/verb-distinct.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,31 @@
distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) {
grps <- syms(op_grps(.data))
empty_dots <- dots_n(...) == 0
can_use_distinct <- !.keep_all || (empty_dots && is_empty(grps))
if (!can_use_distinct) {
can_use_distinct <-
!.keep_all ||
(empty_dots && is_empty(grps)) ||
supports_distinct_on(.data$src$con)

if (can_use_distinct) {
if (empty_dots) {
dots <- quos(!!!syms(colnames(.data)))
} else {
dots <- partial_eval_dots(.data, ..., .named = FALSE)
dots <- quos(!!!dots)
}
prep <- distinct_prepare_compat(.data, dots, group_vars = group_vars(.data))

if (!.keep_all) {
out <- dplyr::select(prep$data, prep$keep)
out$lazy_query <- add_distinct(out, distinct = TRUE)
} else {
out <- prep$data
out$lazy_query <- add_distinct(
out,
distinct = set_names(names(quos_auto_name(dots)))
)
}
} else {
needs_dummy_order <- is.null(op_sort(.data))

if (needs_dummy_order) {
Expand All @@ -37,17 +60,6 @@ distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) {

return(.data)
}

if (empty_dots) {
dots <- quos(!!!syms(colnames(.data)))
} else {
dots <- partial_eval_dots(.data, ..., .named = FALSE)
dots <- quos(!!!dots)
}
prep <- distinct_prepare_compat(.data, dots, group_vars = group_vars(.data))
out <- dplyr::select(prep$data, prep$keep)

out$lazy_query <- add_distinct(out)
out
}

Expand Down Expand Up @@ -148,12 +160,25 @@ quo_is_variable_reference <- function(quo) {
}


add_distinct <- function(.data) {
add_distinct <- function(.data, distinct) {
lazy_query <- .data$lazy_query

distinct <-
if (is.logical(distinct)) {
distinct
} else {
# if the DISTINCT ON clause contains all columns
# we can fall back to a normal DISTINCT
if (is_identity(syms(distinct), names(distinct), colnames(.data))) {
TRUE
} else {
syms(distinct)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think that's worth checking? Seems like it would be pretty rare.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you are right. I removed the extra check.

}
}

out <- lazy_select_query(
x = lazy_query,
distinct = TRUE
distinct = distinct
)
# TODO this could also work for joins
if (!is_lazy_select_query(lazy_query)) {
Expand All @@ -173,6 +198,11 @@ add_distinct <- function(.data) {
return(out)
}

lazy_query$distinct <- TRUE
if (!is.logical(distinct)) {
lazy_query$distinct <- new_lazy_select(distinct)
} else {
lazy_query$distinct <- distinct
}

lazy_query
}
3 changes: 2 additions & 1 deletion R/verb-select.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ select_can_be_inlined <- function(lazy_query, vars) {
ordered_present <- all(intersect(computed_columns, order_vars) %in% vars)

# if the projection is distinct, it must be bijective
is_distinct <- is_true(lazy_query$distinct)
is_distinct <- !is.logical(lazy_query$distinct) ||
is_true(lazy_query$distinct)
is_bijective_projection <- identical(sort(unname(vars)), op_vars(lazy_query))
distinct_is_bijective <- !is_distinct || is_bijective_projection

Expand Down
3 changes: 3 additions & 0 deletions man/db-sql.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions tests/testthat/test-backend-postgres.R
Original file line number Diff line number Diff line change
Expand Up @@ -573,3 +573,33 @@ test_that("correctly escapes dates", {
dd <- as.Date("2022-03-04")
expect_equal(escape(dd, con = con), sql("'2022-03-04'::date"))
})

# we test that again because postgres uses the DISTINCT ON feature
test_that("distinct can compute variables when .keep_all is TRUE", {
con <- src_test("postgres")

out <-
local_db_table(con, data.frame(x = c(2, 1), y = c(1, 2)), "df_x") %>%
distinct(z = x + y, .keep_all = TRUE) %>%
collect()

expect_named(out, c("x", "y", "z"))
expect_equal(out$z, 3)
})

test_that("distinct respects window_order when .keep_all is TRUE", {
con <- src_test("postgres")

mf <- local_db_table(con, data.frame(x = c(1, 1, 2, 2), y = 1:4), "mf")
out <- mf %>%
window_order(desc(y)) %>%
distinct(x, .keep_all = TRUE)

expect_equal(out %>% collect(), tibble(x = 1:2, y = c(2, 4)))

expect_snapshot(
mf %>%
window_order(desc(y)) %>%
distinct(x, .keep_all = TRUE)
)
})
Loading