Skip to content

Commit c054a47

Browse files
authored
HIVE-29290: Wrong results when n-way join contains both anti and outer join (#6173)
1 parent 18df88f commit c054a47

File tree

4 files changed

+281
-2
lines changed

4 files changed

+281
-2
lines changed

ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -990,7 +990,13 @@ protected void checkAndGenObject() throws HiveException {
990990
} else {
991991
if (!alw.hasRows()) {
992992
hasEmpty = true;
993-
alw.addRow(dummyObj[i]);
993+
if (!isRightOfAntiJoin) {
994+
alw.addRow(dummyObj[i]);
995+
}
996+
} else if (isRightOfAntiJoin && !needsPostEvaluation) {
997+
// For anti join the right side should be empty. For needsPostEvaluation case we will
998+
// wait till evaluation is done. For other cases we can directly return from here.
999+
return;
9941000
} else if (!hasEmpty && alw.isSingleRow()) {
9951001
if (hasAnyFiltered(alias, alw.rowIter().first())) {
9961002
hasEmpty = true;

ql/src/java/org/apache/hadoop/hive/ql/exec/CommonMergeJoinOperator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,8 @@ private List<Byte> joinObject(int[] smallestPos, boolean clear) throws HiveExcep
428428

429429
private void putDummyOrEmpty(Byte i) {
430430
// put a empty list or null
431-
if (noOuterJoin) {
431+
boolean isRightOfAntiJoin = (i != 0 && condn[i-1].getType() == JoinDesc.ANTI_JOIN);
432+
if (noOuterJoin || isRightOfAntiJoin) {
432433
storage[i] = emptyList;
433434
} else {
434435
storage[i] = dummyObjVectors[i];
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
set hive.merge.nway.joins=true;
2+
3+
create table taba(id string);
4+
create table tabb(id string);
5+
create table tabc(id string);
6+
7+
INSERT INTO TABLE taba VALUES ('1'),('2');
8+
INSERT INTO TABLE tabc VALUES ('1'),('2'),('2');
9+
10+
explain
11+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null;
12+
explain cbo
13+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null;
14+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null;
15+
16+
INSERT INTO TABLE tabb VALUES ('1');
17+
18+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null;
19+
20+
INSERT INTO TABLE tabb VALUES ('2');
21+
22+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null;
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
PREHOOK: query: create table taba(id string)
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@taba
5+
POSTHOOK: query: create table taba(id string)
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@taba
9+
PREHOOK: query: create table tabb(id string)
10+
PREHOOK: type: CREATETABLE
11+
PREHOOK: Output: database:default
12+
PREHOOK: Output: default@tabb
13+
POSTHOOK: query: create table tabb(id string)
14+
POSTHOOK: type: CREATETABLE
15+
POSTHOOK: Output: database:default
16+
POSTHOOK: Output: default@tabb
17+
PREHOOK: query: create table tabc(id string)
18+
PREHOOK: type: CREATETABLE
19+
PREHOOK: Output: database:default
20+
PREHOOK: Output: default@tabc
21+
POSTHOOK: query: create table tabc(id string)
22+
POSTHOOK: type: CREATETABLE
23+
POSTHOOK: Output: database:default
24+
POSTHOOK: Output: default@tabc
25+
PREHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
26+
PREHOOK: type: QUERY
27+
PREHOOK: Input: _dummy_database@_dummy_table
28+
PREHOOK: Output: default@taba
29+
POSTHOOK: query: INSERT INTO TABLE taba VALUES ('1'),('2')
30+
POSTHOOK: type: QUERY
31+
POSTHOOK: Input: _dummy_database@_dummy_table
32+
POSTHOOK: Output: default@taba
33+
POSTHOOK: Lineage: taba.id SCRIPT []
34+
PREHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
35+
PREHOOK: type: QUERY
36+
PREHOOK: Input: _dummy_database@_dummy_table
37+
PREHOOK: Output: default@tabc
38+
POSTHOOK: query: INSERT INTO TABLE tabc VALUES ('1'),('2'),('2')
39+
POSTHOOK: type: QUERY
40+
POSTHOOK: Input: _dummy_database@_dummy_table
41+
POSTHOOK: Output: default@tabc
42+
POSTHOOK: Lineage: tabc.id SCRIPT []
43+
PREHOOK: query: explain
44+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
45+
PREHOOK: type: QUERY
46+
PREHOOK: Input: default@taba
47+
PREHOOK: Input: default@tabb
48+
PREHOOK: Input: default@tabc
49+
#### A masked pattern was here ####
50+
POSTHOOK: query: explain
51+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
52+
POSTHOOK: type: QUERY
53+
POSTHOOK: Input: default@taba
54+
POSTHOOK: Input: default@tabb
55+
POSTHOOK: Input: default@tabc
56+
#### A masked pattern was here ####
57+
STAGE DEPENDENCIES:
58+
Stage-1 is a root stage
59+
Stage-0 depends on stages: Stage-1
60+
61+
STAGE PLANS:
62+
Stage: Stage-1
63+
Tez
64+
#### A masked pattern was here ####
65+
Edges:
66+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
67+
#### A masked pattern was here ####
68+
Vertices:
69+
Map 1
70+
Map Operator Tree:
71+
TableScan
72+
alias: a
73+
Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE
74+
Select Operator
75+
expressions: id (type: string)
76+
outputColumnNames: _col0
77+
Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE
78+
Reduce Output Operator
79+
key expressions: _col0 (type: string)
80+
null sort order: z
81+
sort order: +
82+
Map-reduce partition columns: _col0 (type: string)
83+
Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE
84+
Execution mode: vectorized, llap
85+
LLAP IO: all inputs
86+
Map 3
87+
Map Operator Tree:
88+
TableScan
89+
alias: b
90+
filterExpr: id is not null (type: boolean)
91+
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
92+
Filter Operator
93+
predicate: id is not null (type: boolean)
94+
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
95+
Select Operator
96+
expressions: id (type: string)
97+
outputColumnNames: _col0
98+
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
99+
Group By Operator
100+
keys: _col0 (type: string)
101+
minReductionHashAggr: 0.99
102+
mode: hash
103+
outputColumnNames: _col0
104+
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
105+
Reduce Output Operator
106+
key expressions: _col0 (type: string)
107+
null sort order: z
108+
sort order: +
109+
Map-reduce partition columns: _col0 (type: string)
110+
Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
111+
Execution mode: vectorized, llap
112+
LLAP IO: all inputs
113+
Map 4
114+
Map Operator Tree:
115+
TableScan
116+
alias: c
117+
filterExpr: id is not null (type: boolean)
118+
Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
119+
Filter Operator
120+
predicate: id is not null (type: boolean)
121+
Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
122+
Select Operator
123+
expressions: id (type: string)
124+
outputColumnNames: _col0
125+
Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
126+
Reduce Output Operator
127+
key expressions: _col0 (type: string)
128+
null sort order: z
129+
sort order: +
130+
Map-reduce partition columns: _col0 (type: string)
131+
Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE
132+
Execution mode: vectorized, llap
133+
LLAP IO: all inputs
134+
Reducer 2
135+
Execution mode: llap
136+
Reduce Operator Tree:
137+
Merge Join Operator
138+
condition map:
139+
Anti Join 0 to 1
140+
Left Outer Join 0 to 2
141+
keys:
142+
0 _col0 (type: string)
143+
1 _col0 (type: string)
144+
2 _col0 (type: string)
145+
outputColumnNames: _col0, _col1
146+
Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE
147+
Select Operator
148+
expressions: _col0 (type: string), null (type: string), _col1 (type: string)
149+
outputColumnNames: _col0, _col1, _col2
150+
Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE
151+
File Output Operator
152+
compressed: false
153+
Statistics: Num rows: 6 Data size: 561 Basic stats: COMPLETE Column stats: NONE
154+
table:
155+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
156+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
157+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
158+
159+
Stage: Stage-0
160+
Fetch Operator
161+
limit: -1
162+
Processor Tree:
163+
ListSink
164+
165+
PREHOOK: query: explain cbo
166+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
167+
PREHOOK: type: QUERY
168+
PREHOOK: Input: default@taba
169+
PREHOOK: Input: default@tabb
170+
PREHOOK: Input: default@tabc
171+
#### A masked pattern was here ####
172+
POSTHOOK: query: explain cbo
173+
select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
174+
POSTHOOK: type: QUERY
175+
POSTHOOK: Input: default@taba
176+
POSTHOOK: Input: default@tabb
177+
POSTHOOK: Input: default@tabc
178+
#### A masked pattern was here ####
179+
CBO PLAN:
180+
HiveProject(id=[$1], id1=[null:VARCHAR(2147483647) CHARACTER SET "UTF-16LE"], id2=[$0])
181+
HiveJoin(condition=[=($0, $1)], joinType=[right], algorithm=[none], cost=[not available])
182+
HiveProject(id=[$0])
183+
HiveFilter(condition=[IS NOT NULL($0)])
184+
HiveTableScan(table=[[default, tabc]], table:alias=[c])
185+
HiveAntiJoin(condition=[=($0, $1)], joinType=[anti])
186+
HiveProject(id=[$0])
187+
HiveTableScan(table=[[default, taba]], table:alias=[a])
188+
HiveProject(id=[$0])
189+
HiveFilter(condition=[IS NOT NULL($0)])
190+
HiveTableScan(table=[[default, tabb]], table:alias=[b])
191+
192+
PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
193+
PREHOOK: type: QUERY
194+
PREHOOK: Input: default@taba
195+
PREHOOK: Input: default@tabb
196+
PREHOOK: Input: default@tabc
197+
#### A masked pattern was here ####
198+
POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
199+
POSTHOOK: type: QUERY
200+
POSTHOOK: Input: default@taba
201+
POSTHOOK: Input: default@tabb
202+
POSTHOOK: Input: default@tabc
203+
#### A masked pattern was here ####
204+
1 NULL 1
205+
2 NULL 2
206+
2 NULL 2
207+
PREHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
208+
PREHOOK: type: QUERY
209+
PREHOOK: Input: _dummy_database@_dummy_table
210+
PREHOOK: Output: default@tabb
211+
POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('1')
212+
POSTHOOK: type: QUERY
213+
POSTHOOK: Input: _dummy_database@_dummy_table
214+
POSTHOOK: Output: default@tabb
215+
POSTHOOK: Lineage: tabb.id SCRIPT []
216+
PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
217+
PREHOOK: type: QUERY
218+
PREHOOK: Input: default@taba
219+
PREHOOK: Input: default@tabb
220+
PREHOOK: Input: default@tabc
221+
#### A masked pattern was here ####
222+
POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
223+
POSTHOOK: type: QUERY
224+
POSTHOOK: Input: default@taba
225+
POSTHOOK: Input: default@tabb
226+
POSTHOOK: Input: default@tabc
227+
#### A masked pattern was here ####
228+
2 NULL 2
229+
2 NULL 2
230+
PREHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
231+
PREHOOK: type: QUERY
232+
PREHOOK: Input: _dummy_database@_dummy_table
233+
PREHOOK: Output: default@tabb
234+
POSTHOOK: query: INSERT INTO TABLE tabb VALUES ('2')
235+
POSTHOOK: type: QUERY
236+
POSTHOOK: Input: _dummy_database@_dummy_table
237+
POSTHOOK: Output: default@tabb
238+
POSTHOOK: Lineage: tabb.id SCRIPT []
239+
PREHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
240+
PREHOOK: type: QUERY
241+
PREHOOK: Input: default@taba
242+
PREHOOK: Input: default@tabb
243+
PREHOOK: Input: default@tabc
244+
#### A masked pattern was here ####
245+
POSTHOOK: query: select * from taba A left outer join tabb B on (A.id = B.id) left outer join tabc C on (C.id = A.id) where B.id is null
246+
POSTHOOK: type: QUERY
247+
POSTHOOK: Input: default@taba
248+
POSTHOOK: Input: default@tabb
249+
POSTHOOK: Input: default@tabc
250+
#### A masked pattern was here ####

0 commit comments

Comments
 (0)