Commit d16e92d

shujingyang-db authored and ueshin committed

[SPARK-53029][PYTHON] Support return type coercion for Arrow Python UDTFs
### What changes were proposed in this pull request?

Support return type coercion for Arrow Python UDTFs by doing `arrow_cast` by default.

### Why are the changes needed?

Consistent behavior across Arrow UDFs and Arrow UDTFs.

### Does this PR introduce _any_ user-facing change?

No, Arrow UDTF is not a public API yet.

### How was this patch tested?

New and existing UTs.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52140 from shujingyang-db/arrow-udtf-type-corerion.

Lead-authored-by: Shujing Yang <[email protected]>
Co-authored-by: Shujing Yang <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>

1 parent b633ad3 · commit d16e92d
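
A minimal sketch of the behavior this commit enables, adapted from the tests below (it assumes the `arrow_udtf` decorator exercised in the test file; imports of Spark-side helpers are not shown in this diff):

```python
import pyarrow as pa
from typing import Iterator

@arrow_udtf(returnType="id int")  # declared Spark type: int (int32)
class LongToIntUDTF:
    def eval(self) -> Iterator["pa.Table"]:
        # Yields int64 data; previously this raised a schema-mismatch error,
        # now the serializer safely casts int64 -> int32 before returning.
        yield pa.table({"id": pa.array([1, 2, 3], type=pa.int64())})

# LongToIntUDTF() now returns rows (1,), (2,), (3,) typed as int32.
```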

4 files changed: +167 −14 lines

python/pyspark/errors/error-conditions.json

Lines changed: 5 additions & 0 deletions
```diff
@@ -967,6 +967,11 @@
       "Column names of the returned pyarrow.Table do not match specified schema.<missing><extra>"
     ]
   },
+  "RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDTF": {
+    "message": [
+      "Column names of the returned pyarrow.Table or pyarrow.RecordBatch do not match specified schema. Expected: <expected> Actual: <actual>"
+    ]
+  },
   "RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF": {
     "message": [
       "Column names of the returned pandas.DataFrame do not match specified schema.<missing><extra>"
```

python/pyspark/sql/pandas/serializers.py

Lines changed: 62 additions & 0 deletions

```diff
@@ -227,6 +227,68 @@ def load_stream(self, stream):
                 result_batches.append(batch.column(i))
             yield result_batches

+    def _create_array(self, arr, arrow_type):
+        import pyarrow as pa
+
+        assert isinstance(arr, pa.Array)
+        assert isinstance(arrow_type, pa.DataType)
+        if arr.type == arrow_type:
+            return arr
+        else:
+            try:
+                # When safe is True, the cast will fail if there's an overflow
+                # or other unsafe conversion.
+                # RecordBatch.cast(...) isn't used as the minimum PyArrow version
+                # required for RecordBatch.cast(...) is v16.0.
+                return arr.cast(target_type=arrow_type, safe=True)
+            except (pa.ArrowInvalid, pa.ArrowTypeError):
+                raise PySparkRuntimeError(
+                    errorClass="RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDTF",
+                    messageParameters={
+                        "expected": str(arrow_type),
+                        "actual": str(arr.type),
+                    },
+                )
+
+    def dump_stream(self, iterator, stream):
+        """
+        Override to handle type coercion for ArrowUDTF outputs.
+        ArrowUDTF returns an iterator of (pa.RecordBatch, arrow_return_type) tuples.
+        """
+        import pyarrow as pa
+
+        def apply_type_coercion():
+            for batch, arrow_return_type in iterator:
+                assert isinstance(
+                    arrow_return_type, pa.StructType
+                ), f"Expected pa.StructType, got {type(arrow_return_type)}"
+
+                # Handle the empty-struct case specially
+                if batch.num_columns == 0:
+                    coerced_batch = batch  # skip type coercion
+                else:
+                    expected_field_names = arrow_return_type.names
+                    actual_field_names = batch.schema.names
+
+                    if expected_field_names != actual_field_names:
+                        raise PySparkTypeError(
+                            "Target schema's field names are not matching the record batch's "
+                            "field names. "
+                            f"Expected: {expected_field_names}, but got: {actual_field_names}."
+                        )
+
+                    coerced_arrays = []
+                    for i, field in enumerate(arrow_return_type):
+                        original_array = batch.column(i)
+                        coerced_array = self._create_array(original_array, field.type)
+                        coerced_arrays.append(coerced_array)
+                    coerced_batch = pa.RecordBatch.from_arrays(
+                        coerced_arrays, names=arrow_return_type.names
+                    )
+                yield coerced_batch, arrow_return_type
+
+        return super().dump_stream(apply_type_coercion(), stream)
+

 class ArrowStreamGroupUDFSerializer(ArrowStreamUDFSerializer):
     """
```

python/pyspark/sql/tests/arrow/test_arrow_udtf.py

Lines changed: 98 additions & 6 deletions

```diff
@@ -189,7 +189,10 @@ def eval(self) -> Iterator["pa.Table"]:
             )
             yield result_table

-        with self.assertRaisesRegex(PythonException, "Schema at index 0 was different"):
+        with self.assertRaisesRegex(
+            PythonException,
+            "Target schema's field names are not matching the record batch's field names",
+        ):
             result_df = MismatchedSchemaUDTF()
             result_df.collect()

@@ -330,9 +333,10 @@ def eval(self) -> Iterator["pa.Table"]:
             )
             yield result_table

-        with self.assertRaisesRegex(PythonException, "Schema at index 0 was different"):
-            result_df = LongToIntUDTF()
-            result_df.collect()
+        # Should succeed with automatic coercion
+        result_df = LongToIntUDTF()
+        expected_df = self.spark.createDataFrame([(1,), (2,), (3,)], "id int")
+        assertDataFrameEqual(result_df, expected_df)

     def test_arrow_udtf_type_coercion_string_to_int(self):
         @arrow_udtf(returnType="id int")
@@ -341,15 +345,103 @@ def eval(self) -> Iterator["pa.Table"]:
             # Return string values that cannot be coerced to int
             result_table = pa.table(
                 {
-                    "id": pa.array(["abc", "def", "xyz"], type=pa.string()),
+                    "id": pa.array(["1", "2", "xyz"], type=pa.string()),
                 }
             )
             yield result_table

-        with self.assertRaisesRegex(PythonException, "Schema at index 0 was different"):
+        # Should fail with an Arrow cast exception since 'xyz' cannot be cast to int
+        with self.assertRaisesRegex(
+            PythonException,
+            "PySparkRuntimeError: \\[RESULT_COLUMNS_MISMATCH_FOR_ARROW_UDTF\\] "
+            "Column names of the returned pyarrow.Table or pyarrow.RecordBatch do not match "
+            "specified schema. Expected: int32 Actual: string",
+        ):
             result_df = StringToIntUDTF()
             result_df.collect()

+    def test_arrow_udtf_type_coercion_string_to_int_safe(self):
+        @arrow_udtf(returnType="id int")
+        class StringToIntUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "id": pa.array(["1", "2", "3"], type=pa.string()),
+                    }
+                )
+                yield result_table
+
+        result_df = StringToIntUDTF()
+        expected_df = self.spark.createDataFrame([(1,), (2,), (3,)], "id int")
+        assertDataFrameEqual(result_df, expected_df)
+
+    def test_arrow_udtf_type_corecion_int64_to_int32_safe(self):
+        @arrow_udtf(returnType="id int")
+        class Int64ToInt32UDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "id": pa.array([1, 2, 3], type=pa.int64()),  # long values
+                    }
+                )
+                yield result_table
+
+        result_df = Int64ToInt32UDTF()
+        expected_df = self.spark.createDataFrame([(1,), (2,), (3,)], "id int")
+        assertDataFrameEqual(result_df, expected_df)
+
+    def test_return_type_coercion_success(self):
+        @arrow_udtf(returnType="value int")
+        class CoercionSuccessUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "value": pa.array([10, 20, 30], type=pa.int64()),  # long -> int coercion
+                    }
+                )
+                yield result_table
+
+        result_df = CoercionSuccessUDTF()
+        expected_df = self.spark.createDataFrame([(10,), (20,), (30,)], "value int")
+        assertDataFrameEqual(result_df, expected_df)
+
+    def test_return_type_coercion_overflow(self):
+        @arrow_udtf(returnType="value int")
+        class CoercionOverflowUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                # Return values that will cause overflow when casting long to int
+                result_table = pa.table(
+                    {
+                        "value": pa.array([2147483647 + 1], type=pa.int64()),  # int32 max + 1
+                    }
+                )
+                yield result_table
+
+        # Should fail with a PyArrow overflow exception
+        with self.assertRaises(Exception):
+            result_df = CoercionOverflowUDTF()
+            result_df.collect()
+
+    def test_return_type_coercion_multiple_columns(self):
+        @arrow_udtf(returnType="id int, price float")
+        class MultipleColumnCoercionUDTF:
+            def eval(self) -> Iterator["pa.Table"]:
+                result_table = pa.table(
+                    {
+                        "id": pa.array([1, 2, 3], type=pa.int64()),  # long -> int coercion
+                        "price": pa.array(
+                            [10.5, 20.7, 30.9], type=pa.float64()
+                        ),  # double -> float coercion
+                    }
+                )
+                yield result_table
+
+        result_df = MultipleColumnCoercionUDTF()
+        expected_df = self.spark.createDataFrame(
+            [(1, 10.5), (2, 20.7), (3, 30.9)], "id int, price float"
+        )
+        assertDataFrameEqual(result_df, expected_df)
+
     def test_arrow_udtf_with_empty_column_result(self):
         @arrow_udtf(returnType=StructType())
         class EmptyResultUDTF:
```

python/pyspark/worker.py

Lines changed: 2 additions & 8 deletions

```diff
@@ -1970,14 +1970,8 @@ def verify_result(result):
                     },
                 )

-            # Verify the type and the schema of the result.
-            verify_arrow_result(
-                pa.Table.from_batches([result], schema=pa.schema(list(arrow_return_type))),
-                assign_cols_by_name=False,
-                expected_cols_and_types=[
-                    (col.name, to_arrow_type(col.dataType)) for col in return_type.fields
-                ],
-            )
+            # We verify the type of the result and do type coercion
+            # in the serializer.
             return result

         # Wrap the exception thrown from the UDTF in a PySparkRuntimeError.
```
