Skip to content

Commit adef6f3

Browse files
committed
[SPARK-53479][PS] Align == behavior with pandas when comparing against scalar under ANSI
### What changes were proposed in this pull request?

- Ensure `==` returns a nameless Series when comparing with another Series/Index, but preserves the name for scalar comparisons.
- Add test cases to compare with `np.nan`.

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-53389

### Does this PR introduce _any_ user-facing change?

No, the feature is not released yet.

For example,

Before
```py
>>> psdf['int'] == 'x'
0    False
1    False
dtype: bool
```

After
```py
>>> psdf['int'] == 'x'
0    False
1    False
Name: int, dtype: bool
```

which follows native pandas
```py
>>> pdf['int'] == 'x'
0    False
1    False
Name: int, dtype: bool
```

### How was this patch tested?

Unit tests.

The commands below passed:
```sh
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_comparison_dtype_compatibility"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_comparison_dtype_compatibility"
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_eq"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_eq"
SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_ne"
SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_num_ops NumOpsTests.test_ne"
```

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #52224 from xinrong-meng/cmp_op_test.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent d16e92d commit adef6f3

File tree

2 files changed

+11
-2
lines changed

2 files changed

+11
-2
lines changed

python/pyspark/pandas/data_type_ops/num_ops.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import decimal
1919
import numbers
20-
from typing import Any, Union, Callable
20+
from typing import Any, Union, Callable, cast
2121

2222
import numpy as np
2323
import pandas as pd
@@ -275,7 +275,13 @@ def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
275275
if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
276276
if _should_return_all_false(left, right):
277277
left_scol = left._with_new_scol(F.lit(False))
278-
return left_scol.rename(None) # type: ignore[attr-defined]
278+
if isinstance(right, IndexOpsMixin):
279+
# When comparing with another Series/Index, drop the name
280+
# to align with pandas behavior
281+
return left_scol.rename(None) # type: ignore[attr-defined]
282+
else:
283+
# When comparing with scalar-like, keep the name of left operand
284+
return cast(SeriesOrIndex, left_scol)
279285
if _is_boolean_type(right): # numeric vs. bool
280286
right = transform_boolean_operand_to_numeric(
281287
right, spark_type=left.spark.data_type

python/pyspark/pandas/tests/data_type_ops/test_num_ops.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,16 +140,19 @@ def test_comparison_dtype_compatibility(self):
140140
if is_ansi_mode_test: # TODO: match non-ansi behavior with pandas
141141
self.assert_eq(pdf["int"] == pdf["str"], psdf["int"] == psdf["str"])
142142
self.assert_eq(pdf["float"] == pdf["bool"], psdf["float"] == psdf["bool"])
143+
self.assert_eq(pdf["str"] == "x", psdf["str"] == "x")
143144

144145
def test_eq(self):
145146
pdf, psdf = self.pdf, self.psdf
146147
for col in self.numeric_df_cols:
147148
self.assert_eq(pdf[col] == pdf[col], psdf[col] == psdf[col])
149+
self.assert_eq(pdf[col] == np.nan, psdf[col] == np.nan)
148150

149151
def test_ne(self):
150152
pdf, psdf = self.pdf, self.psdf
151153
for col in self.numeric_df_cols:
152154
self.assert_eq(pdf[col] != pdf[col], psdf[col] != psdf[col])
155+
self.assert_eq(pdf[col] != np.nan, psdf[col] != np.nan)
153156

154157
def test_lt(self):
155158
pdf, psdf = self.pdf, self.psdf

0 commit comments

Comments
 (0)