forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-43098][SQL] Fix correctness COUNT bug when scalar subquery has group by clause
### What changes were proposed in this pull request? Fix a correctness bug for scalar subqueries with COUNT and a GROUP BY clause, for example: ``` create view t1(c1, c2) as values (0, 1), (1, 2); create view t2(c1, c2) as values (0, 2), (0, 3); select c1, c2, (select count(*) from t2 where t1.c1 = t2.c1 group by c1) from t1; -- Correct answer: [(0, 1, 2), (1, 2, null)] -- Output before this fix (incorrect): +---+---+------------------+ |c1 |c2 |scalarsubquery(c1)| +---+---+------------------+ |0 |1 |2 | |1 |2 |0 | +---+---+------------------+ ``` This is due to a bug in our "COUNT bug" handling for scalar subqueries. For a subquery with a COUNT aggregate but no GROUP BY clause, 0 is the correct output on empty inputs, and we use the COUNT bug handling to construct the plan that yields 0 when there were no matched rows. But when there is a GROUP BY clause, NULL is the correct output (i.e. there is no COUNT bug), yet we still incorrectly use the COUNT bug handling and therefore incorrectly output 0. Instead, we need to only apply the COUNT bug handling when the scalar subquery had no GROUP BY clause. To fix this, we need to track whether the scalar subquery has a GROUP BY, i.e. a non-empty groupingExpressions for the Aggregate node. This needs to be checked before subquery decorrelation, because decorrelation adds the correlated outer refs to the group-by list, so after that the group-by is always non-empty. We save it in a boolean in the ScalarSubquery node until later when we rewrite the subquery into a join in constructLeftJoins. This is a long-standing bug. This bug affected both the current DecorrelateInnerQuery framework and the old code (with spark.sql.optimizer.decorrelateInnerQuery.enabled = false), and this PR fixes both. ### Why are the changes needed? Fix a correctness bug. ### Does this PR introduce _any_ user-facing change? Yes, fix incorrect query results. ### How was this patch tested? Add SQL tests and unit tests. 
(Note that there were 2 existing unit tests for queries of this shape, which had the incorrect results as golden results.) Closes apache#40811 from jchen5/count-bug. Authored-by: Jack Chen <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
- Loading branch information
Showing
11 changed files
with
559 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
228 changes: 228 additions & 0 deletions
228
...ces/sql-tests/analyzer-results/subquery/scalar-subquery/scalar-subquery-count-bug.sql.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
-- Automatically generated by SQLQueryTestSuite | ||
-- !query | ||
create temp view l (a, b) | ||
as values | ||
(1, 2.0), | ||
(1, 2.0), | ||
(2, 1.0), | ||
(2, 1.0), | ||
(3, 3.0), | ||
(null, null), | ||
(null, 5.0), | ||
(6, null) | ||
-- !query analysis | ||
CreateViewCommand `l`, [(a,None), (b,None)], values | ||
(1, 2.0), | ||
(1, 2.0), | ||
(2, 1.0), | ||
(2, 1.0), | ||
(3, 3.0), | ||
(null, null), | ||
(null, 5.0), | ||
(6, null), false, false, LocalTempView, true | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
create temp view r (c, d) | ||
as values | ||
(2, 3.0), | ||
(2, 3.0), | ||
(3, 2.0), | ||
(4, 1.0), | ||
(null, null), | ||
(null, 5.0), | ||
(6, null) | ||
-- !query analysis | ||
CreateViewCommand `r`, [(c,None), (d,None)], values | ||
(2, 3.0), | ||
(2, 3.0), | ||
(3, 2.0), | ||
(4, 1.0), | ||
(null, null), | ||
(null, 5.0), | ||
(6, null), false, false, LocalTempView, true | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c group by c) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [c#x], [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c group by 'constant') from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [constant], [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, ( | ||
select (count(*)) is null | ||
from r | ||
where l.a = r.c) | ||
from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#x] | ||
: +- Aggregate [isnull(count(1)) AS (count(1) IS NULL)#x] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, ( | ||
select (count(*)) is null | ||
from r | ||
where l.a = r.c | ||
group by r.c) | ||
from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#x] | ||
: +- Aggregate [c#x], [isnull(count(1)) AS (count(1) IS NULL)#x] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c having count(*) <= 1) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Filter (count(1)#xL <= cast(1 as bigint)) | ||
: +- Aggregate [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c having count(*) >= 2) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Filter (count(1)#xL >= cast(2 as bigint)) | ||
: +- Aggregate [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
set spark.sql.optimizer.decorrelateSubqueryLegacyIncorrectCountHandling.enabled = true | ||
-- !query analysis | ||
SetCommand (spark.sql.optimizer.decorrelateSubqueryLegacyIncorrectCountHandling.enabled,Some(true)) | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c group by c) from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [c#x], [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
select *, (select count(*) from r where l.a = r.c group by 'constant') from l | ||
-- !query analysis | ||
Project [a#x, b#x, scalar-subquery#x [a#x] AS scalarsubquery(a)#xL] | ||
: +- Aggregate [constant], [count(1) AS count(1)#xL] | ||
: +- Filter (outer(a#x) = c#x) | ||
: +- SubqueryAlias r | ||
: +- View (`r`, [c#x,d#x]) | ||
: +- Project [cast(col1#x as int) AS c#x, cast(col2#x as decimal(2,1)) AS d#x] | ||
: +- LocalRelation [col1#x, col2#x] | ||
+- SubqueryAlias l | ||
+- View (`l`, [a#x,b#x]) | ||
+- Project [cast(col1#x as int) AS a#x, cast(col2#x as decimal(2,1)) AS b#x] | ||
+- LocalRelation [col1#x, col2#x] | ||
|
||
|
||
-- !query | ||
reset spark.sql.optimizer.decorrelateSubqueryLegacyIncorrectCountHandling.enabled | ||
-- !query analysis | ||
ResetCommand spark.sql.optimizer.decorrelateSubqueryLegacyIncorrectCountHandling.enabled |
Oops, something went wrong.