[SPARK-54319][SQL] BHJ LeftAnti does not update numOutputRows correctly when codegen is disabled

AngersZhuuuu · cloud-fan · commit 3757091e1c51 · 2025-11-17T11:39:11.000+08:00
### What changes were proposed in this pull request? The BHJ LeftAnti path did not update `numOutputRows` in the case where `hashed == EmptyHashedRelation`: the streamed iterator was returned as-is, so the rows it produced were never counted. This PR wraps the streamed iterator so that `numOutputRows` is incremented for every row it emits. <img width="1754" height="1148" alt="image" src="https://github.com/user-attachments/assets/a71e4546-578e-4e4d-9434-9287074ebe75" /> ### Why are the changes needed? To fix the missing SQL metric update for BHJ. ### Does this PR introduce _any_ user-facing change? Yes, BHJ LeftAnti now updates `numOutputRows` when `hashed == EmptyHashedRelation`. ### How was this patch tested? Existing UT in `SQLMetricsSuite`, extended with a `NOT IN` query over an empty subquery (`WHERE 1 = 2`) that expects `numOutputRows` to be 4. ### Was this patch authored or co-authored using generative AI tooling? No Closes #53014 from AngersZhuuuu/SPARK-54319. Authored-by: Angerszhuuuu <angers.zhu@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala
@@ -129,7 +129,10 @@ case class BroadcastHashJoinExec(
         val hashed = broadcastRelation.value.asReadOnlyCopy()
         TaskContext.get().taskMetrics().incPeakExecutionMemory(hashed.estimatedSize)
         if (hashed == EmptyHashedRelation) {
-          streamedIter
+          streamedIter.map { row =>
+            numOutputRows += 1
+            row
+          }
         } else if (hashed == HashedRelationWithAllNullKeys) {
           Iterator.empty
         } else {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala
@@ -915,16 +915,27 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils
           withTable("t1", "t2") {
             spark.range(4).write.saveAsTable("t1")
             spark.range(2).write.saveAsTable("t2")
-            val df = sql("SELECT * FROM t1 WHERE id NOT IN (SELECT id FROM t2)")
-            df.collect()
-            val plan = df.queryExecution.executedPlan
+            val df1 = sql("SELECT * FROM t1 WHERE id NOT IN (SELECT id FROM t2)")
+            df1.collect()
+            val plan1 = df1.queryExecution.executedPlan
 
-            val joins = plan.collect {
+            val joins1 = plan1.collect {
               case s: BroadcastHashJoinExec => s
             }
 
-            assert(joins.size === 1)
-            testMetricsInSparkPlanOperator(joins.head, Map("numOutputRows" -> 2))
+            assert(joins1.size === 1)
+            testMetricsInSparkPlanOperator(joins1.head, Map("numOutputRows" -> 2))
+
+            val df2 = sql("SELECT * FROM t1 WHERE id NOT IN (SELECT id FROM t2 WHERE 1 = 2)")
+            df2.collect()
+            val plan2 = df2.queryExecution.executedPlan
+
+            val joins2 = plan2.collect {
+              case s: BroadcastHashJoinExec => s
+            }
+
+            assert(joins2.size === 1)
+            testMetricsInSparkPlanOperator(joins2.head, Map("numOutputRows" -> 4))
           }
         }
       }