apache · jiayuasu · Apr 2, 2025 · Feb 23, 2025 · Mar 22, 2025 · Mar 22, 2025
@@ -97,7 +97,7 @@ jobs:
           java-version: ${{ matrix.jdk }}
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.7'
+          python-version: '3.10'
       - name: Cache Maven packages
         uses: actions/cache@v3
         with:
@@ -110,6 +110,12 @@ jobs:
           SKIP_TESTS: ${{ matrix.skipTests }}
         run: |
           SPARK_COMPAT_VERSION=${SPARK_VERSION:0:3}
+
+          if [ "${SPARK_VERSION}" == "3.5.0" ]; then
+              pip install pyspark==3.5.0 pandas shapely apache-sedona pyarrow
+              export SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])")
+          fi
+
           mvn -q clean install -Dspark=${SPARK_COMPAT_VERSION} -Dscala=${SCALA_VERSION:0:4} -Dspark.version=${SPARK_VERSION} ${SKIP_TESTS}
       - run: mkdir staging
       - run: cp spark-shaded/target/sedona-*.jar staging

@@ -1195,6 +1195,73 @@ Output:
 +------------------------------+--------+--------------------------------------------------+-----------------+
 ```
 
+## Spatial vectorized UDFs (Python only)
+
+By default, user-defined functions (UDFs) created in Python are not vectorized.
+This means that they are called row by row, which can be slow.
+To speed them up, you can use a vectorized UDF, which is called in batch mode
+using Apache Arrow.
+
+To create a vectorized UDF, use the `sedona_vectorized_udf` decorator.
+Currently, only scalar UDFs are supported. Vectorized UDFs are considerably faster than
+regular UDFs; they can be even 2x faster.
+
+!!!note
+	When a function takes a geometry as input, annotate the parameter with a BaseGeometry type
+	(such as Point from shapely), or with geopandas GeoSeries when you use a GEO_SERIES vectorized UDF.
+	That's how Sedona infers the type and knows whether the data should be cast.
+
+Decorator signature looks as follows:
+
+```python
+def sedona_vectorized_udf(return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR)
+```
+
+where `udf_type` is the type of the UDF. The currently supported types are:
+
+- SHAPELY_SCALAR
+- GEO_SERIES
+
+The main difference is the input data your function receives.
+Let's look at the two examples below, which create buffers from
+a given geometry.
+
+### Shapely scalar UDF
+
+```python
+import shapely.geometry.base as b
+from sedona.sql.functions import sedona_vectorized_udf
+from sedona.sql.types import GeometryType
+
+@sedona_vectorized_udf(return_type=GeometryType())
+def vectorized_buffer(geom: b.BaseGeometry) -> b.BaseGeometry:
+    return geom.buffer(0.1)
+```
+
+### GeoSeries UDF
+
+```python
+import geopandas as gpd
+from sedona.sql.functions import sedona_vectorized_udf, SedonaUDFType
+from sedona.sql.types import GeometryType
+
+
+@sedona_vectorized_udf(udf_type=SedonaUDFType.GEO_SERIES, return_type=GeometryType())
+def vectorized_geo_series_buffer(series: gpd.GeoSeries) -> gpd.GeoSeries:
+    buffered = series.buffer(0.1)
+
+    return buffered
+```
+
+To call the UDFs you can use the following code:
+
+```python
+# Shapely scalar UDF
+df.withColumn("buffered", vectorized_buffer(df.geom)).show()
+
+# GeoSeries UDF
+df.withColumn("buffered", vectorized_geo_series_buffer(df.geom)).show()
+```
+
 ## Save to permanent storage
 
 To save a Spatial DataFrame to some permanent storage such as Hive tables and HDFS, you can simply convert each geometry in the Geometry type column back to a plain String and save the plain DataFrame to wherever you want.

@@ -0,0 +1,144 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+
+import inspect
+from enum import Enum
+
+import pandas as pd
+
+from sedona.sql.types import GeometryType
+from sedona.utils import geometry_serde
+from pyspark.sql.udf import UserDefinedFunction
+from pyspark.sql.types import DataType
+from shapely.geometry.base import BaseGeometry
+
+
+SEDONA_SCALAR_EVAL_TYPE = 5200  # evalType passed to UserDefinedFunction by the UDF builders in this module
+SEDONA_PANDAS_ARROW_NAME = "SedonaPandasArrowUDF"  # same text the UDF builders pass as the third UserDefinedFunction argument
+
+
+class SedonaUDFType(Enum):  # selects how sedona_vectorized_udf invokes the user function
+    SHAPELY_SCALAR = "ShapelyScalar"  # user function is called once per element of the input series
+    GEO_SERIES = "GeoSeries"  # user function is called once with the whole input series
+
+
+class InvalidSedonaUDFType(Exception):  # raised by sedona_vectorized_udf when udf_type matches no supported kind
+    pass
+
+
+sedona_udf_to_eval_type = {  # eval type per UDF kind; both kinds currently map to the same value
+    SedonaUDFType.SHAPELY_SCALAR: SEDONA_SCALAR_EVAL_TYPE,
+    SedonaUDFType.GEO_SERIES: SEDONA_SCALAR_EVAL_TYPE,
+}
+
+
+def sedona_vectorized_udf(
+    return_type: DataType, udf_type: SedonaUDFType = SedonaUDFType.SHAPELY_SCALAR
+):
+    import geopandas as gpd
+
+    def apply_fn(fn):
+        function_signature = inspect.signature(fn)
+        serialize_geom = False
+        deserialize_geom = False
+
+        if isinstance(return_type, GeometryType):
+            serialize_geom = True
+
+        if issubclass(function_signature.return_annotation, BaseGeometry):
+            serialize_geom = True
+
+        if issubclass(function_signature.return_annotation, gpd.GeoSeries):
+            serialize_geom = True
+
+        for param in function_signature.parameters.values():
+            if issubclass(param.annotation, BaseGeometry):
+                deserialize_geom = True
+
+            if issubclass(param.annotation, gpd.GeoSeries):
+                deserialize_geom = True
+
+        if udf_type == SedonaUDFType.SHAPELY_SCALAR:
+            return _apply_shapely_series_udf(
+                fn, return_type, serialize_geom, deserialize_geom
+            )
+
+        if udf_type == SedonaUDFType.GEO_SERIES:
+            return _apply_geo_series_udf(
+                fn, return_type, serialize_geom, deserialize_geom
+            )
+
+        raise InvalidSedonaUDFType(f"Invalid UDF type: {udf_type}")
+
+    return apply_fn
+
+
+def _apply_shapely_series_udf(
+    fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool
+):
+    def apply(series: pd.Series) -> pd.Series:
+        applied = series.apply(
+            lambda x: (
+                fn(geometry_serde.deserialize(x)[0]) if deserialize_geom else fn(x)
+            )
+        )
+
+        return applied.apply(
+            lambda x: geometry_serde.serialize(x) if serialize_geom else x
+        )
+
+    udf = UserDefinedFunction(
+        apply, return_type, "SedonaPandasArrowUDF", evalType=SEDONA_SCALAR_EVAL_TYPE
+    )
+
+    return udf
+
+
+def _apply_geo_series_udf(
+    fn, return_type: DataType, serialize_geom: bool, deserialize_geom: bool
+):
+    import geopandas as gpd
+
+    def apply(series: pd.Series) -> pd.Series:
+        series_data = series
+        if deserialize_geom:
+            series_data = gpd.GeoSeries(
+                series.apply(lambda x: geometry_serde.deserialize(x)[0])
+            )
+
+        return fn(series_data).apply(
+            lambda x: geometry_serde.serialize(x) if serialize_geom else x
+        )
+
+    return UserDefinedFunction(
+        apply, return_type, "SedonaPandasArrowUDF", evalType=SEDONA_SCALAR_EVAL_TYPE
+    )
+
+
+def deserialize_geometry_if_geom(data):
+    if isinstance(data, BaseGeometry):
+        return geometry_serde.deserialize(data)[0]
+
+    return data
+
+
+def serialize_to_geometry_if_geom(data, return_type: DataType):
+    if isinstance(return_type, GeometryType):
+        return geometry_serde.serialize(data)
+
+    return data