From e837689e33ee532ed0aa402a0280c43618c63c29 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 10:51:56 -0500 Subject: [PATCH 01/18] ENH: Implement cum* methods for PyArrow strings --- pandas/conftest.py | 20 ++++++++ pandas/core/arrays/arrow/array.py | 56 +++++++++++++++++++++++ pandas/tests/apply/test_str.py | 9 +--- pandas/tests/series/test_cumulative.py | 63 ++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 106518678df6a..e62e77eee492a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1317,6 +1317,26 @@ def nullable_string_dtype(request): return request.param +@pytest.fixture( + params=[ + pytest.param( + pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow") + ), + pytest.param( + pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow") + ), + ] +) +def pyarrow_string_dtype(request): + """ + Parametrized fixture for string dtypes backed by Pyarrow. + + * 'pd.StringDtype("pyarrow", na_value=np.nan)' + * 'pd.StringDtype("pyarrow", na_value=pd.NA)' + """ + return request.param + + @pytest.fixture( params=[ "python", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4d9c8eb3a41b6..211905af27b38 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1619,6 +1620,9 @@ def _accumulate( ------ NotImplementedError : subclass does not define accumulations """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + pyarrow_name = { "cummax": "cumulative_max", "cummin": "cumulative_min", @@ -1654,6 +1658,58 @@ def _accumulate( return type(self)(result) + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> ArrowExtensionArray | ExtensionArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # When present and skipna is False, we stop of at the first NA value. + # as the tail becomes all NA values. + head: pa.array | None = None + tail: pa.array | None = None + pa_array = self._pa_array + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + if skipna: + if name == "cumsum": + pa_array = pc.fill_null(pa_array, "") + else: + pa_array = pc.fill_null_forward(pa_array) + nulls = pc.is_null(pa_array) + idx = pc.index(nulls, False).as_py() + if idx == -1: + idx = len(pa_array) + if idx > 0: + head = pa.array([""] * idx, type=pa_array.type) + pa_array = pa_array[idx:].combine_chunks() + else: + nulls = pc.is_null(pa_array) + idx = pc.index(nulls, True).as_py() + tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) + pa_array = pa_array[:idx].combine_chunks() + + pa_result = pa.array(np_func(pa_array), type=pa_array.type) + + if head is not None or tail is not None: + head = pa.array([], type=pa_array.type) if head is None else head + tail = pa.array([], type=pa_array.type) if tail is None else tail + pa_result = pa.concat_arrays([head, pa_result, tail]) + + result = type(self)(pa_result) + return result + def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: """ Return a pyarrow scalar result of performing the reduction operation. diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index c52168ae48ca8..e224b07a1097b 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -159,17 +159,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum": - request.applymarker( - pytest.mark.xfail( - raises=(TypeError, NotImplementedError), - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index a9d5486139b46..c2c2588ca01ce 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -227,3 +227,66 @@ def test_cumprod_timedelta(self): ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)]) with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() + + @pytest.mark.parametrize( + "data, skipna, expected_data", + [ + ([], True, []), + ([], False, []), + (["x", "z", "y"], True, ["x", "xz", "xzy"]), + (["x", "z", "y"], False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cumsum_pyarrow_strings( + self, pyarrow_string_dtype, data, skipna, expected_data + ): + ser = pd.Series(data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = ser.cumsum(skipna=skipna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, op, skipna, expected_data", + [ + ([], "cummin", True, []), + ([], "cummin", False, []), + (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), + (["y", "z", "x"], "cummin", False, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]), + (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]), + ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]), + ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]), + ([], "cummax", True, []), + ([], "cummax", False, []), + (["x", "z", "y"], "cummax", True, ["x", "z", "z"]), + (["x", "z", "y"], "cummax", False, ["x", "z", "z"]), + (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]), + (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]), + ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), + ], + ) + def test_cummin_cummax_pyarrow_strings( + self, pyarrow_string_dtype, data, op, skipna, expected_data + ): + ser = pd.Series(data, dtype=pyarrow_string_dtype) + if expected_data is None: + expected_data = ser.dtype.na_value + method = getattr(ser, op) + expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + result = method(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) + msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" + with pytest.raises(TypeError, match=msg): + ser.cumprod(skipna=skipna) From 009d11b0776447f143edfefa756a36aa87f70155 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:11:41 -0500 Subject: [PATCH 02/18] cleanup --- pandas/core/arrays/arrow/array.py | 3 +-- pandas/tests/series/test_cumulative.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 211905af27b38..aed8cb7c585e0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1670,8 +1670,7 @@ def _str_accumulate( msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) - # When present and skipna is False, we stop of at the first NA value. - # as the tail becomes all NA values. + # We may need to strip out leading / trailing NA values head: pa.array | None = None tail: pa.array | None = None pa_array = self._pa_array diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index c2c2588ca01ce..bab2ff776c3b1 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -244,6 +244,7 @@ def test_cumprod_timedelta(self): def test_cumsum_pyarrow_strings( self, pyarrow_string_dtype, data, skipna, expected_data ): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) result = ser.cumsum(skipna=skipna) @@ -277,6 +278,7 @@ def test_cumsum_pyarrow_strings( def test_cummin_cummax_pyarrow_strings( self, pyarrow_string_dtype, data, op, skipna, expected_data ): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) if expected_data is None: expected_data = ser.dtype.na_value @@ -286,6 +288,7 @@ def test_cummin_cummax_pyarrow_strings( tm.assert_series_equal(result, expected) def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): + # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" with pytest.raises(TypeError, match=msg): From 3a9200d12cb52c36b5783708b8023163fb457d5a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:16:53 -0500 Subject: [PATCH 03/18] Cleanup --- pandas/core/arrays/arrow/array.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index aed8cb7c585e0..3b8e4d7783b31 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1685,7 +1685,10 @@ def _str_accumulate( if name == "cumsum": pa_array = pc.fill_null(pa_array, "") else: + # After the first non-NA value we can retain the running min/max + # by forward filling. pa_array = pc.fill_null_forward(pa_array) + # But any leading NA values should result in "". nulls = pc.is_null(pa_array) idx = pc.index(nulls, False).as_py() if idx == -1: @@ -1694,6 +1697,8 @@ def _str_accumulate( head = pa.array([""] * idx, type=pa_array.type) pa_array = pa_array[idx:].combine_chunks() else: + # When not skipping NA values, the result should be null from + # the first NA value onward. nulls = pc.is_null(pa_array) idx = pc.index(nulls, True).as_py() tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) @@ -1701,10 +1706,11 @@ def _str_accumulate( pa_result = pa.array(np_func(pa_array), type=pa_array.type) - if head is not None or tail is not None: - head = pa.array([], type=pa_array.type) if head is None else head - tail = pa.array([], type=pa_array.type) if tail is None else tail - pa_result = pa.concat_arrays([head, pa_result, tail]) + assert head is None or tail is None + if head is not None: + pa_result = pa.concat_arrays([head, pa_result]) + elif tail is not None: + pa_result = pa.concat_arrays([pa_result, tail]) result = type(self)(pa_result) return result From d62552266b5915476512818203bff14a856ef163 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 11:28:07 -0500 Subject: [PATCH 04/18] fixup --- pandas/conftest.py | 12 ++++-------- pandas/tests/series/test_cumulative.py | 4 +++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e62e77eee492a..0b3654bbcc16e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1319,20 +1319,16 @@ def nullable_string_dtype(request): @pytest.fixture( params=[ - pytest.param( - pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow") - ), - pytest.param( - pd.StringDtype("pyarrow", na_value=pd.NA), marks=td.skip_if_no("pyarrow") - ), + pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), ] ) def pyarrow_string_dtype(request): """ Parametrized fixture for string dtypes backed by Pyarrow. - * 'pd.StringDtype("pyarrow", na_value=np.nan)' - * 'pd.StringDtype("pyarrow", na_value=pd.NA)' + * 'str[pyarrow]' + * 'string[pyarrow]' """ return request.param diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index bab2ff776c3b1..ba850e8e2ed21 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -6,6 +6,8 @@ tests.frame.test_cumulative """ +import re + import numpy as np import pytest @@ -290,6 +292,6 @@ def test_cummin_cummax_pyarrow_strings( def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna): # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype) - msg = f"operation 'cumprod' not supported for dtype '{ser.dtype}'" + msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'") with pytest.raises(TypeError, match=msg): ser.cumprod(skipna=skipna) From de728add8388ea8981e67d392a12b82bf251c00d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 31 Dec 2024 12:16:18 -0500 Subject: [PATCH 05/18] Fix extension tests --- pandas/tests/extension/base/accumulate.py | 5 ++-- pandas/tests/extension/test_arrow.py | 15 ++++++---- pandas/tests/extension/test_string.py | 7 +++++ pandas/tests/series/test_cumulative.py | 36 +++++++---------------- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 9a41a3a582c4a..9a2f186c2a00b 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): try: alt = ser.astype("float64") - except TypeError: - # e.g. Period can't be cast to float64 + except (TypeError, ValueError): + # e.g. Period can't be cast to float64 (TypeError) + # String can't be cast to float64 (ValueError) alt = ser.astype(object) result = getattr(ser, op_name)(skipna=skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c5f5a65b77eea..4fccf02e08bd6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: # attribute "pyarrow_dtype" pa_type = ser.dtype.pyarrow_dtype # type: ignore[union-attr] - if ( - pa.types.is_string(pa_type) - or pa.types.is_binary(pa_type) - or pa.types.is_decimal(pa_type) - ): + if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type): if op_name in ["cumsum", "cumprod", "cummax", "cummin"]: return False + elif pa.types.is_string(pa_type): + if op_name == "cumprod": + return False elif pa.types.is_boolean(pa_type): if op_name in ["cumprod", "cummax", "cummin"]: return False @@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request): pa_type = data.dtype.pyarrow_dtype op_name = all_numeric_accumulations + + if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]: + # https://github.com/pandas-dev/pandas/pull/60633 + # Doesn't fit test structure, tested in series/test_cumulative.py instead. + return + ser = pd.Series(data) if not self._supports_accumulation(ser, op_name): diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index e19351b2ad058..6434487d67a4d 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -192,6 +192,13 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: and op_name in ("any", "all") ) + def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + return ser.dtype.storage == "pyarrow" and op_name in [ + "cummin", + "cummax", + "cumsum", + ] + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index ba850e8e2ed21..610903d77512d 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -230,31 +230,19 @@ def test_cumprod_timedelta(self): with pytest.raises(TypeError, match="cumprod not supported for Timedelta"): ser.cumprod() - @pytest.mark.parametrize( - "data, skipna, expected_data", - [ - ([], True, []), - ([], False, []), - (["x", "z", "y"], True, ["x", "xz", "xzy"]), - (["x", "z", "y"], False, ["x", "xz", "xzy"]), - (["x", pd.NA, "y"], True, ["x", "x", "xy"]), - (["x", pd.NA, "y"], False, ["x", pd.NA, pd.NA]), - ([pd.NA, pd.NA, pd.NA], True, ["", "", ""]), - ([pd.NA, pd.NA, pd.NA], False, [pd.NA, pd.NA, pd.NA]), - ], - ) - def test_cumsum_pyarrow_strings( - self, pyarrow_string_dtype, data, skipna, expected_data - ): - # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) - result = ser.cumsum(skipna=skipna) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "data, op, skipna, expected_data", [ + ([], "cumsum", True, []), + ([], "cumsum", False, []), + (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]), + (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]), + (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]), + (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]), + ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]), + ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]), + ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]), + ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]), ([], "cummin", True, []), ([], "cummin", False, []), (["y", "z", "x"], "cummin", True, ["y", "y", "x"]), @@ -277,13 +265,11 @@ def test_cumsum_pyarrow_strings( ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cummin_cummax_pyarrow_strings( + def test_cum_methods_pyarrow_strings( self, pyarrow_string_dtype, data, op, skipna, expected_data ): # https://github.com/pandas-dev/pandas/pull/60633 ser = pd.Series(data, dtype=pyarrow_string_dtype) - if expected_data is None: - expected_data = ser.dtype.na_value method = getattr(ser, op) expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) result = method(skipna=skipna) From 7c12f15520a1f3bf39f0e415bc87ab4ddac62a4a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 1 Jan 2025 16:38:56 -0500 Subject: [PATCH 06/18] xfail test when there is no pyarrow --- pandas/tests/apply/test_str.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index e224b07a1097b..ce71cfec535e4 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,7 +4,10 @@ import numpy as np import pytest -from pandas.compat import WASM +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.core.dtypes.common import is_number @@ -159,10 +162,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: + request.applymarker( + pytest.mark.xfail( + raises=NotImplementedError, + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) From a3650a98cb94663511aaca682882c60c1ccaa3e3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 1 Jan 2025 16:49:24 -0500 Subject: [PATCH 07/18] mypy fixups --- pandas/core/arrays/arrow/array.py | 3 ++- pandas/tests/extension/test_string.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3b8e4d7783b31..bcf51579bc968 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1704,7 +1704,8 @@ def _str_accumulate( tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) pa_array = pa_array[:idx].combine_chunks() - pa_result = pa.array(np_func(pa_array), type=pa_array.type) + # error: Cannot call function of unknown type + pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] assert head is None or tail is None if head is not None: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6434487d67a4d..6ce48e434d329 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -24,6 +24,8 @@ from pandas.compat import HAS_PYARROW +from pandas.core.dtypes.base import StorageExtensionDtype + import pandas as pd import pandas._testing as tm from pandas.api.types import is_string_dtype @@ -193,6 +195,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: ) def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: + assert isinstance(ser.dtype, StorageExtensionDtype) return ser.dtype.storage == "pyarrow" and op_name in [ "cummin", "cummax", From 83104a4090d20002cdf5c677a5ca06775036ce88 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 4 Jan 2025 13:12:04 -0500 Subject: [PATCH 08/18] Implement string accumulations with nanoarrow --- .gitignore | 6 + .pre-commit-config.yaml | 2 +- environment.yml | 4 +- meson.build | 4 +- pandas/_libs/arrow_string_accumulations.cc | 214 +++++++++++++++++++++ pandas/_libs/meson.build | 10 + pandas/core/arrays/arrow/array.py | 46 +---- pyproject.toml | 1 + requirements-dev.txt | 4 +- subprojects/nanoarrow.wrap | 10 + subprojects/nanobind.wrap | 13 ++ subprojects/robin-map.wrap | 13 ++ 12 files changed, 278 insertions(+), 49 deletions(-) create mode 100644 pandas/_libs/arrow_string_accumulations.cc create mode 100644 subprojects/nanoarrow.wrap create mode 100644 subprojects/nanobind.wrap create mode 100644 subprojects/robin-map.wrap diff --git a/.gitignore b/.gitignore index a188e216d9f70..d33c95043b69d 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,9 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# meson subproject files # +########################## +subprojects/* +!subprojects/packagefiles +!subprojects/*.wrap diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 983c45fc493d1..1b490d4edc060 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -98,7 +98,7 @@ repos: rev: v19.1.6 hooks: - id: clang-format - files: ^pandas/_libs/src|^pandas/_libs/include + files: ^pandas/_libs|pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: local diff --git a/environment.yml b/environment.yml index 69647a436e3ad..1b7de5a808811 100644 --- a/environment.yml +++ b/environment.yml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython~=3.0.5 - - meson=1.2.1 - - meson-python=0.13.1 + - meson>=1.3.0 + - meson-python>=0.13.1 # test dependencies - pytest>=7.3.2 diff --git a/meson.build b/meson.build index efe543b7a267c..4be198f21bcfe 100644 --- a/meson.build +++ b/meson.build @@ -4,11 +4,13 @@ project( 'c', 'cpp', 'cython', version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', - meson_version: '>=1.2.1', + meson_version: '>=1.3.0', default_options: [ 'buildtype=release', 'c_std=c11', + 'cpp_std=c++20', 'warning_level=2', + 'default_library=static', ] ) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc new file mode 100644 index 0000000000000..035c194ea993e --- /dev/null +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace nanoarrow::literals; +namespace nb = nanobind; + +static auto ReleaseArrowArray(void *ptr) noexcept -> void { + auto array = static_cast(ptr); + if (array->release != nullptr) { + ArrowArrayRelease(array); + } + + delete array; +} + +static auto ReleaseArrowSchema(void *ptr) noexcept -> void { + auto schema = static_cast(ptr); + if (schema->release != nullptr) { + ArrowSchemaRelease(schema); + } + + delete schema; +} + +static auto CumSum(const struct ArrowArrayView *array_view, + struct ArrowArray *out, bool skipna) { + bool seen_na = false; + std::stringstream ss{}; + + for (int64_t i = 0; i < array_view->length; i++) { + const bool isna = ArrowArrayViewIsNull(array_view, i); + if (!skipna && (seen_na || isna)) { + seen_na = true; + ArrowArrayAppendNull(out, 1); + } else { + if (!isna) { + const auto std_sv = ArrowArrayViewGetStringUnsafe(array_view, i); + ss << std::string_view{std_sv.data, + static_cast(std_sv.size_bytes)}; + } + const auto str = ss.str(); + const ArrowStringView asv{str.c_str(), static_cast(str.size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, asv)); + } + } +} + +template +concept MinOrMaxOp = + std::same_as> || std::same_as>; + +template + requires MinOrMaxOp +static auto CumMinOrMax(const struct ArrowArrayView *array_view, + struct ArrowArray *out, bool skipna) { + bool seen_na = false; + std::optional current_str{}; + + for (int64_t i = 0; i < array_view->length; i++) { + const bool isna = ArrowArrayViewIsNull(array_view, i); + if (!skipna && (seen_na || isna)) { + seen_na = true; + ArrowArrayAppendNull(out, 1); + } else { + if (!isna || current_str) { + if (!isna) { + const auto asv = ArrowArrayViewGetStringUnsafe(array_view, i); + const nb::str pyval{asv.data, static_cast(asv.size_bytes)}; + + if (current_str) { + const nb::str pycurrent{current_str->data(), current_str->size()}; + if (Op(pyval, pycurrent)) { + current_str = + std::string{asv.data, static_cast(asv.size_bytes)}; + } + } else { + current_str = + std::string{asv.data, static_cast(asv.size_bytes)}; + } + } + + struct ArrowStringView out_sv{ + current_str->data(), static_cast(current_str->size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, out_sv)); + } else { + ArrowArrayAppendEmpty(out, 1); + } + } + } +} + +class ArrowStringAccumulation { +public: + ArrowStringAccumulation(nb::object array_object, std::string accumulation, + bool skipna) + : skipna_(skipna) { + if ((accumulation == "cumsum") || (accumulation == "cummin") || + (accumulation == "cummax")) { + accumulation_ = std::move(accumulation); + } else { + const auto error_message = + std::string("Unsupported accumulation: ") + accumulation; + throw nb::value_error(error_message.c_str()); + } + + const auto obj = nb::getattr(array_object, "__arrow_c_stream__")(); + const auto pycapsule_obj = nb::cast(obj); + + const auto stream = static_cast( + PyCapsule_GetPointer(pycapsule_obj.ptr(), "arrow_array_stream")); + if (stream == nullptr) { + throw std::invalid_argument("Invalid Arrow Stream capsule provided!"); + } + + if (stream->get_schema(stream, schema_.get()) != 0) { + std::string error_msg{stream->get_last_error(stream)}; + throw std::runtime_error("Could not read from arrow schema:" + error_msg); + } + struct ArrowSchemaView schema_view{}; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&schema_view, schema_.get(), nullptr)); + + switch (schema_view.type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_STRING_VIEW: + break; + default: + const auto error_message = + std::string("Expected a string-like array type, got: ") + + ArrowTypeString(schema_view.type); + throw std::invalid_argument(error_message); + } + + ArrowArrayStreamMove(stream, stream_.get()); + } + + std::pair Accumulate(nb::object requested_schema) { + struct ArrowSchemaView schema_view{}; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&schema_view, schema_.get(), nullptr)); + auto uschema = nanoarrow::UniqueSchema{}; + ArrowSchemaInit(uschema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(uschema.get(), schema_view.type)); + + // TODO: even though we are reading a stream we are returning an array + // We should return a like sized stream of data in the future + auto uarray_out = nanoarrow::UniqueArray{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(uarray_out.get(), uschema.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(uarray_out.get())); + + nanoarrow::UniqueArray chunk{}; + int errcode{}; + + while ((errcode = ArrowArrayStreamGetNext(stream_.get(), chunk.get(), + nullptr) == 0) && + chunk->release != nullptr) { + struct ArrowArrayView array_view{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&array_view, schema_.get(), nullptr)); + + NANOARROW_THROW_NOT_OK( + ArrowArrayViewSetArray(&array_view, chunk.get(), nullptr)); + + if (accumulation_ == "cumsum") { + CumSum(&array_view, uarray_out.get(), skipna_); + } else if (accumulation_ == "cummin") { + CumMinOrMax(&array_view, uarray_out.get(), skipna_); + } else if (accumulation_ == "cummax") { + CumMinOrMax(&array_view, uarray_out.get(), skipna_); + } else { + throw std::runtime_error("Unexpected branch"); + } + + chunk.reset(); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(uarray_out.get(), nullptr)); + + auto out_schema = new struct ArrowSchema; + ArrowSchemaMove(uschema.get(), out_schema); + nb::capsule schema_capsule{out_schema, "arrow_schema", &ReleaseArrowSchema}; + + auto out_array = new struct ArrowArray; + ArrowArrayMove(uarray_out.get(), out_array); + nb::capsule array_capsule{out_array, "arrow_array", &ReleaseArrowArray}; + + return std::pair{schema_capsule, array_capsule}; + } + +private: + nanoarrow::UniqueArrayStream stream_; + nanoarrow::UniqueSchema schema_; + std::string accumulation_; + bool skipna_; +}; + +NB_MODULE(arrow_string_accumulations, m) { + nb::class_(m, "ArrowStringAccumulation") + .def(nb::init()) + .def("__arrow_c_array__", &ArrowStringAccumulation::Accumulate, + nb::arg("requested_schema") = nb::none()); +} diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index c27386743c6e9..b6ecbe2fba97f 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -122,6 +122,16 @@ foreach ext_name, ext_dict : libs_sources ) endforeach +nanobind_dep = dependency('nanobind') +nanoarrow_dep = dependency('nanoarrow') +py.extension_module( + 'arrow_string_accumulations', + sources: ['arrow_string_accumulations.cc'], + dependencies: [nanobind_dep, nanoarrow_dep], + subdir: 'pandas/_libs', + install: true, +) + # Basically just __init__.py and the .pyi files sources_to_install = [ '__init__.py', diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bcf51579bc968..b65275a3833ff 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ import numpy as np from pandas._libs import lib +import pandas._libs.arrow_string_accumulations as sa from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -1670,49 +1671,8 @@ def _str_accumulate( msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) - # We may need to strip out leading / trailing NA values - head: pa.array | None = None - tail: pa.array | None = None - pa_array = self._pa_array - np_func = { - "cumsum": np.cumsum, - "cummin": np.minimum.accumulate, - "cummax": np.maximum.accumulate, - }[name] - - if self._hasna: - if skipna: - if name == "cumsum": - pa_array = pc.fill_null(pa_array, "") - else: - # After the first non-NA value we can retain the running min/max - # by forward filling. - pa_array = pc.fill_null_forward(pa_array) - # But any leading NA values should result in "". - nulls = pc.is_null(pa_array) - idx = pc.index(nulls, False).as_py() - if idx == -1: - idx = len(pa_array) - if idx > 0: - head = pa.array([""] * idx, type=pa_array.type) - pa_array = pa_array[idx:].combine_chunks() - else: - # When not skipping NA values, the result should be null from - # the first NA value onward. - nulls = pc.is_null(pa_array) - idx = pc.index(nulls, True).as_py() - tail = pa.nulls(len(pa_array) - idx, type=pa_array.type) - pa_array = pa_array[:idx].combine_chunks() - - # error: Cannot call function of unknown type - pa_result = pa.array(np_func(pa_array), type=pa_array.type) # type: ignore[operator] - - assert head is None or tail is None - if head is not None: - pa_result = pa.concat_arrays([head, pa_result]) - elif tail is not None: - pa_result = pa.concat_arrays([pa_result, tail]) - + # TODO: we can use arrow_c_stream instead of arrow_c_array + pa_result = pa.array(sa.ArrowStringAccumulation(self._pa_array, name, skipna)) result = type(self)(pa_result) return result diff --git a/pyproject.toml b/pyproject.toml index 7ab9cd2c17669..2321e899ceb81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -145,6 +145,7 @@ parentdir_prefix = "pandas-" [tool.meson-python.args] setup = ['--vsenv'] # For Windows +install = ['--skip-subprojects'] [tool.cibuildwheel] skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" diff --git a/requirements-dev.txt b/requirements-dev.txt index fb4d9cdb589ca..a3633a2d6d9d8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,8 +4,8 @@ pip versioneer[toml] cython~=3.0.5 -meson[ninja]==1.2.1 -meson-python==0.13.1 +meson[ninja]>=1.3.0 +meson-python>=0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=3.4.0 diff --git a/subprojects/nanoarrow.wrap b/subprojects/nanoarrow.wrap new file mode 100644 index 0000000000000..7b4ce8abdb762 --- /dev/null +++ b/subprojects/nanoarrow.wrap @@ -0,0 +1,10 @@ +[wrap-file] +directory = apache-arrow-nanoarrow-0.6.0 +source_url = https://www.apache.org/dyn/closer.lua?action=download&filename=arrow/apache-arrow-nanoarrow-0.6.0/apache-arrow-nanoarrow-0.6.0.tar.gz +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanoarrow_0.6.0-1/apache-arrow-nanoarrow-0.6.0.tar.gz +source_filename = apache-arrow-nanoarrow-0.6.0.tar.gz +source_hash = e4a02ac51002ad1875bf09317e70adb959005fad52b240ff59f73b970fa485d1 +wrapdb_version = 0.6.0-1 + +[provide] +nanoarrow = nanoarrow_dep diff --git a/subprojects/nanobind.wrap b/subprojects/nanobind.wrap new file mode 100644 index 0000000000000..78e2e7c5d011b --- /dev/null +++ b/subprojects/nanobind.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = nanobind-2.4.0 +source_url = https://github.com/wjakob/nanobind/archive/refs/tags/v2.4.0.tar.gz +source_filename = nanobind-2.4.0.tar.gz +source_hash = bb35deaed7efac5029ed1e33880a415638352f757d49207a8e6013fefb6c49a7 +patch_filename = nanobind_2.4.0-2_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/nanobind_2.4.0-2/get_patch +patch_hash = cf493bda0b11ea4e8d9dd42229c3bbdd52af88cc4aedac75a1eccb102b86dd4a +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanobind_2.4.0-2/nanobind-2.4.0.tar.gz +wrapdb_version = 2.4.0-2 + +[provide] +nanobind = nanobind_dep diff --git a/subprojects/robin-map.wrap b/subprojects/robin-map.wrap new file mode 100644 index 0000000000000..3da2993bb709e --- /dev/null +++ b/subprojects/robin-map.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = robin-map-1.3.0 +source_url = https://github.com/Tessil/robin-map/archive/refs/tags/v1.3.0.tar.gz +source_filename = robin-map-1.3.0.tar.gz +source_hash = a8424ad3b0affd4c57ed26f0f3d8a29604f0e1f2ef2089f497f614b1c94c7236 +patch_filename = robin-map_1.3.0-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/robin-map_1.3.0-1/get_patch +patch_hash = 6d090f988541ffb053512607e0942cbd0dbc2a4fa0563e44ff6a37e810b8c739 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/robin-map_1.3.0-1/robin-map-1.3.0.tar.gz +wrapdb_version = 1.3.0-1 + +[provide] +robin-map = robin_map_dep From 7b8e78202b8504ed354058e79d74db7b68ac7c67 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Jan 2025 17:18:52 -0500 Subject: [PATCH 09/18] bump CI meson installs --- .github/workflows/unit-tests.yml | 8 ++++---- ci/deps/actions-310-minimum_versions.yaml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-311-arm64.yaml | 2 +- pandas/core/arrays/arrow/array.py | 1 - pyproject.toml | 2 +- 12 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 899b49cc4eff5..600a012189032 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,7 +233,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.13.1 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" @@ -272,7 +272,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.3.2 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir @@ -344,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov @@ -389,7 +389,7 @@ jobs: # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index c7c72828db481..ba5d7ac712efa 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 74cab4e0970dc..0ae0fcc6cb7d8 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 092ca18d61259..984fb68182da3 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 325a6d45d74fd..b916a4f5126f9 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 - cython>=0.29.33 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 22e4907e5a6e5..ece7e3401cfa9 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer - - meson=1.2.1 + - meson=1.3.2 - cython>=0.29.33 - meson-python=0.13.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b6f515dceaea9..b16c774007dcb 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index bc66f8a5382c9..8d068d4410d2e 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 90933b24b88db..6e38ecd15aebb 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,7 +10,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 3f09e27d0fe4b..1d36a4a0bf10a 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.2.1 + - meson=1.3.2 - meson-python=0.13.1 # test dependencies diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b65275a3833ff..1c930ee642f13 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1671,7 +1671,6 @@ def _str_accumulate( msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) - # TODO: we can use arrow_c_stream instead of arrow_c_array pa_result = pa.array(sa.ArrowStringAccumulation(self._pa_array, name, skipna)) result = type(self)(pa_result) return result diff --git a/pyproject.toml b/pyproject.toml index 2321e899ceb81..6cafcef296cc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ "meson-python>=0.13.1", - "meson>=1.2.1,<2", + "meson>=1.3.0,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json # Force numpy higher than 2.0rc1, so that built wheels are compatible From f24c79f3af97252f900f4f53c66d0cbd1e1c4eaa Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Jan 2025 17:29:08 -0500 Subject: [PATCH 10/18] Suppress warnings --- meson.build | 2 ++ pandas/_libs/arrow_string_accumulations.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 4be198f21bcfe..09a33d9c32109 100644 --- a/meson.build +++ b/meson.build @@ -11,6 +11,8 @@ project( 'cpp_std=c++20', 'warning_level=2', 'default_library=static', + # TODO: how can we only set this for nanobind? + 'cpp_args=-Wno-sign-compare' ] ) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index 035c194ea993e..e62118c9982ce 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -143,7 +143,7 @@ class ArrowStringAccumulation { ArrowArrayStreamMove(stream, stream_.get()); } - std::pair Accumulate(nb::object requested_schema) { + std::pair Accumulate(nb::object) { struct ArrowSchemaView schema_view{}; NANOARROW_THROW_NOT_OK( ArrowSchemaViewInit(&schema_view, schema_.get(), nullptr)); From 78872f9127b6c2c0da7968873dff9437b8c5f315 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Jan 2025 18:03:02 -0500 Subject: [PATCH 11/18] Remove C++20 concept for now --- pandas/_libs/arrow_string_accumulations.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index e62118c9982ce..5037e33f08f0f 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -53,12 +53,12 @@ static auto CumSum(const struct ArrowArrayView *array_view, } } -template -concept MinOrMaxOp = - std::same_as> || std::same_as>; +// template +// concept MinOrMaxOp = +// std::same_as> || std::same_as>; template - requires MinOrMaxOp +// requires MinOrMaxOp static auto CumMinOrMax(const struct ArrowArrayView *array_view, struct ArrowArray *out, bool skipna) { bool seen_na = false; From 2325b24e6a05145e0a89ede85ae03d79901b3612 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Jan 2025 18:20:10 -0500 Subject: [PATCH 12/18] bump meson-python --- .circleci/config.yml | 2 +- .github/workflows/unit-tests.yml | 8 ++++---- ci/deps/actions-310-minimum_versions.yaml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-numpydev.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-311-arm64.yaml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- requirements-dev.txt | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 139ea9d220453..98d03910879c5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: command: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 600a012189032..705f3240308b2 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,7 +233,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.13.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" @@ -272,7 +272,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.3.2 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.3.2 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir @@ -344,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov @@ -389,7 +389,7 @@ jobs: # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.13.1 + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index ba5d7ac712efa..b044bf9fd58cb 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -10,7 +10,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0ae0fcc6cb7d8..ed79abbe03111 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 984fb68182da3..f3f9696ee2f78 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -9,7 +9,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index b916a4f5126f9..7f0ed67258d07 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 - cython>=0.29.33 # test dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index ece7e3401cfa9..e693f74175df4 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer - meson=1.3.2 - cython>=0.29.33 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index b16c774007dcb..f44365b9f8423 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 8d068d4410d2e..f20346fad0d71 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 6e38ecd15aebb..b10a6fca13890 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -11,7 +11,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 1d36a4a0bf10a..0ba7a33599a9e 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer - cython>=0.29.33 - meson=1.3.2 - - meson-python=0.13.1 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/environment.yml b/environment.yml index 1b7de5a808811..5da739d01ee31 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - versioneer - cython~=3.0.5 - meson>=1.3.0 - - meson-python>=0.13.1 + - meson-python>=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/pyproject.toml b/pyproject.toml index 6cafcef296cc3..00b509ba775b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. requires = [ - "meson-python>=0.13.1", + "meson-python>=0.14.0", "meson>=1.3.0,<2", "wheel", "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json diff --git a/requirements-dev.txt b/requirements-dev.txt index a3633a2d6d9d8..4e8d1eb2e9e8b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ pip versioneer[toml] cython~=3.0.5 meson[ninja]>=1.3.0 -meson-python>=0.13.1 +meson-python>=0.14.0 pytest>=7.3.2 pytest-cov pytest-xdist>=3.4.0 From 0cb78cbe633f905deaa479a78e48f504db7f65c7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 6 Jan 2025 20:02:11 -0500 Subject: [PATCH 13/18] Use nanoarrow C++ helpers and iterate stream in accumulations --- pandas/_libs/arrow_string_accumulations.cc | 129 +++++++++++---------- 1 file changed, 66 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index 5037e33f08f0f..a32601d9ee53a 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -30,68 +30,73 @@ static auto ReleaseArrowSchema(void *ptr) noexcept -> void { delete schema; } -static auto CumSum(const struct ArrowArrayView *array_view, +template +static auto CumSum(struct ArrowArrayStream *array_stream, struct ArrowArray *out, bool skipna) { bool seen_na = false; std::stringstream ss{}; - for (int64_t i = 0; i < array_view->length; i++) { - const bool isna = ArrowArrayViewIsNull(array_view, i); - if (!skipna && (seen_na || isna)) { - seen_na = true; - ArrowArrayAppendNull(out, 1); - } else { - if (!isna) { - const auto std_sv = ArrowArrayViewGetStringUnsafe(array_view, i); - ss << std::string_view{std_sv.data, - static_cast(std_sv.size_bytes)}; + nanoarrow::ViewArrayStream array_stream_view(array_stream); + for (const auto &array : array_stream_view) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + if ((!sv || seen_na) && !skipna) { + seen_na = true; + ArrowArrayAppendNull(out, 1); + } else { + if (sv) { + ss << std::string_view{(*sv).data, + static_cast((*sv).size_bytes)}; + } + const auto str = ss.str(); + const ArrowStringView asv{str.c_str(), + static_cast(str.size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, asv)); } - const auto str = ss.str(); - const ArrowStringView asv{str.c_str(), static_cast(str.size())}; - NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, asv)); } } } +// TODO: doesn't seem like all compilers in CI support this? // template // concept MinOrMaxOp = // std::same_as> || std::same_as>; -template +template // requires MinOrMaxOp -static auto CumMinOrMax(const struct ArrowArrayView *array_view, +static auto CumMinOrMax(struct ArrowArrayStream *array_stream, struct ArrowArray *out, bool skipna) { bool seen_na = false; std::optional current_str{}; - for (int64_t i = 0; i < array_view->length; i++) { - const bool isna = ArrowArrayViewIsNull(array_view, i); - if (!skipna && (seen_na || isna)) { - seen_na = true; - ArrowArrayAppendNull(out, 1); - } else { - if (!isna || current_str) { - if (!isna) { - const auto asv = ArrowArrayViewGetStringUnsafe(array_view, i); - const nb::str pyval{asv.data, static_cast(asv.size_bytes)}; - - if (current_str) { - const nb::str pycurrent{current_str->data(), current_str->size()}; - if (Op(pyval, pycurrent)) { - current_str = - std::string{asv.data, static_cast(asv.size_bytes)}; + nanoarrow::ViewArrayStream array_stream_view(array_stream); + for (const auto &array : array_stream_view) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + if ((!sv || seen_na) && !skipna) { + seen_na = true; + ArrowArrayAppendNull(out, 1); + } else { + if (sv || current_str) { + if (sv) { + const nb::str pyval{(*sv).data, + static_cast((*sv).size_bytes)}; + if (current_str) { + const nb::str pycurrent{current_str->data(), current_str->size()}; + if (Op(pyval, pycurrent)) { + current_str = std::string{ + (*sv).data, static_cast((*sv).size_bytes)}; + } + } else { + current_str = std::string{(*sv).data, + static_cast((*sv).size_bytes)}; } - } else { - current_str = - std::string{asv.data, static_cast(asv.size_bytes)}; } - } - struct ArrowStringView out_sv{ - current_str->data(), static_cast(current_str->size())}; - NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, out_sv)); - } else { - ArrowArrayAppendEmpty(out, 1); + struct ArrowStringView out_sv{ + current_str->data(), static_cast(current_str->size())}; + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(out, out_sv)); + } else { + ArrowArrayAppendEmpty(out, 1); + } } } } @@ -131,7 +136,6 @@ class ArrowStringAccumulation { switch (schema_view.type) { case NANOARROW_TYPE_STRING: case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_STRING_VIEW: break; default: const auto error_message = @@ -159,30 +163,29 @@ class ArrowStringAccumulation { NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(uarray_out.get())); - nanoarrow::UniqueArray chunk{}; - int errcode{}; - - while ((errcode = ArrowArrayStreamGetNext(stream_.get(), chunk.get(), - nullptr) == 0) && - chunk->release != nullptr) { - struct ArrowArrayView array_view{}; - NANOARROW_THROW_NOT_OK( - ArrowArrayViewInitFromSchema(&array_view, schema_.get(), nullptr)); - - NANOARROW_THROW_NOT_OK( - ArrowArrayViewSetArray(&array_view, chunk.get(), nullptr)); - - if (accumulation_ == "cumsum") { - CumSum(&array_view, uarray_out.get(), skipna_); - } else if (accumulation_ == "cummin") { - CumMinOrMax(&array_view, uarray_out.get(), skipna_); - } else if (accumulation_ == "cummax") { - CumMinOrMax(&array_view, uarray_out.get(), skipna_); + if (accumulation_ == "cumsum") { + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumSum<32>(stream_.get(), uarray_out.get(), skipna_); } else { - throw std::runtime_error("Unexpected branch"); + CumSum<64>(stream_.get(), uarray_out.get(), skipna_); } - chunk.reset(); + } else if (accumulation_ == "cummin") { + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumMinOrMax<32, std::less{}>(stream_.get(), uarray_out.get(), skipna_); + } else { + CumMinOrMax<64, std::less{}>(stream_.get(), uarray_out.get(), skipna_); + } + } else if (accumulation_ == "cummax") { + if (schema_view.type == NANOARROW_TYPE_STRING) { + CumMinOrMax<32, std::greater{}>(stream_.get(), uarray_out.get(), + skipna_); + } else { + CumMinOrMax<64, std::greater{}>(stream_.get(), uarray_out.get(), + skipna_); + } + } else { + throw std::runtime_error("Unexpected branch"); } NANOARROW_THROW_NOT_OK( From c8b4fde5982970da598aa03e54aecdeabfac0117 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 7 Jan 2025 10:56:14 -0500 Subject: [PATCH 14/18] Work around nanoarrow bug --- pandas/_libs/arrow_string_accumulations.cc | 36 ++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index a32601d9ee53a..95f8278cd8b84 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -36,9 +36,25 @@ static auto CumSum(struct ArrowArrayStream *array_stream, bool seen_na = false; std::stringstream ss{}; + // TODO: we can simplify this further if we just iterate on the array + // and not the array view, but there is an upstream bug in nanoarrow + // that prevents that + // https://github.com/apache/arrow-nanoarrow/issues/701 + nanoarrow::UniqueArrayView array_view{}; + nanoarrow::UniqueSchema schema{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); + nanoarrow::ViewArrayStream array_stream_view(array_stream); for (const auto &array : array_stream_view) { - for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + array_view.reset(); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewSetArray(array_view.get(), &array, nullptr)); + + for (const auto &sv : + nanoarrow::ViewArrayAsBytes(array_view.get())) { if ((!sv || seen_na) && !skipna) { seen_na = true; ArrowArrayAppendNull(out, 1); @@ -68,9 +84,25 @@ static auto CumMinOrMax(struct ArrowArrayStream *array_stream, bool seen_na = false; std::optional current_str{}; + // TODO: we can simplify this further if we just iterate on the array + // and not the array view, but there is an upstream bug in nanoarrow + // that prevents that + // https://github.com/apache/arrow-nanoarrow/issues/701 + nanoarrow::UniqueArrayView array_view{}; + nanoarrow::UniqueSchema schema{}; + NANOARROW_THROW_NOT_OK( + ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); + nanoarrow::ViewArrayStream array_stream_view(array_stream); for (const auto &array : array_stream_view) { - for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { + array_view.reset(); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewSetArray(array_view.get(), &array, nullptr)); + + for (const auto &sv : + nanoarrow::ViewArrayAsBytes(array_view.get())) { if ((!sv || seen_na) && !skipna) { seen_na = true; ArrowArrayAppendNull(out, 1); From 192dba66f0933db452f03e4bf2f95543bcd5a937 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 7 Jan 2025 14:34:57 -0500 Subject: [PATCH 15/18] Revert back to C++17 --- meson.build | 2 +- pandas/_libs/arrow_string_accumulations.cc | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/meson.build b/meson.build index 09a33d9c32109..c4447b53c3497 100644 --- a/meson.build +++ b/meson.build @@ -8,7 +8,7 @@ project( default_options: [ 'buildtype=release', 'c_std=c11', - 'cpp_std=c++20', + 'cpp_std=c++17', 'warning_level=2', 'default_library=static', # TODO: how can we only set this for nanobind? diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index 95f8278cd8b84..37d4ca7762667 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -76,9 +76,9 @@ static auto CumSum(struct ArrowArrayStream *array_stream, // template // concept MinOrMaxOp = // std::same_as> || std::same_as>; - -template +// template // requires MinOrMaxOp +template typename CompareOp> static auto CumMinOrMax(struct ArrowArrayStream *array_stream, struct ArrowArray *out, bool skipna) { bool seen_na = false; @@ -113,7 +113,7 @@ static auto CumMinOrMax(struct ArrowArrayStream *array_stream, static_cast((*sv).size_bytes)}; if (current_str) { const nb::str pycurrent{current_str->data(), current_str->size()}; - if (Op(pyval, pycurrent)) { + if (CompareOp{}(pyval, pycurrent)) { current_str = std::string{ (*sv).data, static_cast((*sv).size_bytes)}; } @@ -204,17 +204,15 @@ class ArrowStringAccumulation { } else if (accumulation_ == "cummin") { if (schema_view.type == NANOARROW_TYPE_STRING) { - CumMinOrMax<32, std::less{}>(stream_.get(), uarray_out.get(), skipna_); + CumMinOrMax<32, std::less>(stream_.get(), uarray_out.get(), skipna_); } else { - CumMinOrMax<64, std::less{}>(stream_.get(), uarray_out.get(), skipna_); + CumMinOrMax<64, std::less>(stream_.get(), uarray_out.get(), skipna_); } } else if (accumulation_ == "cummax") { if (schema_view.type == NANOARROW_TYPE_STRING) { - CumMinOrMax<32, std::greater{}>(stream_.get(), uarray_out.get(), - skipna_); + CumMinOrMax<32, std::greater>(stream_.get(), uarray_out.get(), skipna_); } else { - CumMinOrMax<64, std::greater{}>(stream_.get(), uarray_out.get(), - skipna_); + CumMinOrMax<64, std::greater>(stream_.get(), uarray_out.get(), skipna_); } } else { throw std::runtime_error("Unexpected branch"); From 53fc8d909d645038dfb2e0dbb6deeb67c7d6857e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 8 Jan 2025 10:53:03 -0500 Subject: [PATCH 16/18] Remove meson version pins --- .circleci/config.yml | 2 +- .github/workflows/unit-tests.yml | 8 ++++---- ci/deps/actions-310-minimum_versions.yaml | 4 ++-- ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-312.yaml | 4 ++-- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-311-arm64.yaml | 4 ++-- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 98d03910879c5..fe4b89788c822 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: command: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python meson[ninja] python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 705f3240308b2..3e60e474ec7cb 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,7 +233,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.14.0 + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja] meson-python python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" @@ -272,7 +272,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.3.2 + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python meson[ninja] python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir @@ -344,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 + python -m pip install --upgrade pip setuptools wheel meson[ninja] meson-python python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov @@ -389,7 +389,7 @@ jobs: # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 + python -m pip install --upgrade pip setuptools wheel meson[ninja] meson-python python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index b044bf9fd58cb..c5d597f4cb3dc 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index ed79abbe03111..e579d3cc00995 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index f3f9696ee2f78..30fc12b46829c 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,8 +8,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 7f0ed67258d07..c510037f48539 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python - cython>=0.29.33 # test dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index e693f74175df4..74f8e0515770e 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,9 +6,9 @@ dependencies: # build dependencies - versioneer - - meson=1.3.2 + - meson - cython>=0.29.33 - - meson-python=0.14.0 + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index f44365b9f8423..0b24cec53feb9 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index f20346fad0d71..f8e7200a40b12 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index b10a6fca13890..d3b7491064ce3 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,8 +10,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 0ba7a33599a9e..737829d873c44 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson=1.3.2 - - meson-python=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/environment.yml b/environment.yml index 5da739d01ee31..ef7a7e8fb8a75 100644 --- a/environment.yml +++ b/environment.yml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython~=3.0.5 - - meson>=1.3.0 - - meson-python>=0.14.0 + - meson + - meson-python # test dependencies - pytest>=7.3.2 diff --git a/requirements-dev.txt b/requirements-dev.txt index 4e8d1eb2e9e8b..1c22c2de0b8b6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,8 +4,8 @@ pip versioneer[toml] cython~=3.0.5 -meson[ninja]>=1.3.0 -meson-python>=0.14.0 +meson[ninja] +meson-python pytest>=7.3.2 pytest-cov pytest-xdist>=3.4.0 From c8d92b3a06efabb5d57fc3205c7f0fc85124759c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 8 Jan 2025 12:43:17 -0500 Subject: [PATCH 17/18] Revert "Remove meson version pins" This reverts commit 53fc8d909d645038dfb2e0dbb6deeb67c7d6857e. --- .circleci/config.yml | 2 +- .github/workflows/unit-tests.yml | 8 ++++---- ci/deps/actions-310-minimum_versions.yaml | 4 ++-- ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311-numpydev.yaml | 4 ++-- ci/deps/actions-311-pyarrownightly.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-312.yaml | 4 ++-- ci/deps/actions-pypy-39.yaml | 4 ++-- ci/deps/circle-311-arm64.yaml | 4 ++-- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 13 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fe4b89788c822..98d03910879c5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: command: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python meson[ninja] + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 3e60e474ec7cb..705f3240308b2 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -233,7 +233,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja] meson-python + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" @@ -272,7 +272,7 @@ jobs: run: | /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate - python -m pip install --no-cache-dir -U pip wheel setuptools meson-python meson[ninja] + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.14.0 meson[ninja]==1.3.2 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir @@ -344,7 +344,7 @@ jobs: - name: Build Environment run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja] meson-python + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov @@ -389,7 +389,7 @@ jobs: # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja] meson-python + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.3.2 meson-python==0.14.0 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml index c5d597f4cb3dc..b044bf9fd58cb 100644 --- a/ci/deps/actions-310-minimum_versions.yaml +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index e579d3cc00995..ed79abbe03111 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 30fc12b46829c..f3f9696ee2f78 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -8,8 +8,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index c510037f48539..7f0ed67258d07 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -6,8 +6,8 @@ dependencies: # build dependencies - versioneer - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 - cython>=0.29.33 # test dependencies diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 74f8e0515770e..e693f74175df4 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,9 +6,9 @@ dependencies: # build dependencies - versioneer - - meson + - meson=1.3.2 - cython>=0.29.33 - - meson-python + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 0b24cec53feb9..f44365b9f8423 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index f8e7200a40b12..f20346fad0d71 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index d3b7491064ce3..b10a6fca13890 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -10,8 +10,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/circle-311-arm64.yaml b/ci/deps/circle-311-arm64.yaml index 737829d873c44..0ba7a33599a9e 100644 --- a/ci/deps/circle-311-arm64.yaml +++ b/ci/deps/circle-311-arm64.yaml @@ -7,8 +7,8 @@ dependencies: # build dependencies - versioneer - cython>=0.29.33 - - meson - - meson-python + - meson=1.3.2 + - meson-python=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/environment.yml b/environment.yml index ef7a7e8fb8a75..5da739d01ee31 100644 --- a/environment.yml +++ b/environment.yml @@ -9,8 +9,8 @@ dependencies: # build dependencies - versioneer - cython~=3.0.5 - - meson - - meson-python + - meson>=1.3.0 + - meson-python>=0.14.0 # test dependencies - pytest>=7.3.2 diff --git a/requirements-dev.txt b/requirements-dev.txt index 1c22c2de0b8b6..4e8d1eb2e9e8b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,8 +4,8 @@ pip versioneer[toml] cython~=3.0.5 -meson[ninja] -meson-python +meson[ninja]>=1.3.0 +meson-python>=0.14.0 pytest>=7.3.2 pytest-cov pytest-xdist>=3.4.0 From 60c3e6f52c17db876428625eff4726292327b84c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 8 Jan 2025 15:52:59 -0500 Subject: [PATCH 18/18] Use nanoarrow commit --- pandas/_libs/arrow_string_accumulations.cc | 28 ++-------------------- subprojects/nanoarrow.wrap | 9 ++++--- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/arrow_string_accumulations.cc b/pandas/_libs/arrow_string_accumulations.cc index 37d4ca7762667..d0d2a940099fe 100644 --- a/pandas/_libs/arrow_string_accumulations.cc +++ b/pandas/_libs/arrow_string_accumulations.cc @@ -36,25 +36,13 @@ static auto CumSum(struct ArrowArrayStream *array_stream, bool seen_na = false; std::stringstream ss{}; - // TODO: we can simplify this further if we just iterate on the array - // and not the array view, but there is an upstream bug in nanoarrow - // that prevents that - // https://github.com/apache/arrow-nanoarrow/issues/701 - nanoarrow::UniqueArrayView array_view{}; nanoarrow::UniqueSchema schema{}; NANOARROW_THROW_NOT_OK( ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); nanoarrow::ViewArrayStream array_stream_view(array_stream); for (const auto &array : array_stream_view) { - array_view.reset(); - NANOARROW_THROW_NOT_OK( - ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr)); - NANOARROW_THROW_NOT_OK( - ArrowArrayViewSetArray(array_view.get(), &array, nullptr)); - - for (const auto &sv : - nanoarrow::ViewArrayAsBytes(array_view.get())) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { if ((!sv || seen_na) && !skipna) { seen_na = true; ArrowArrayAppendNull(out, 1); @@ -84,25 +72,13 @@ static auto CumMinOrMax(struct ArrowArrayStream *array_stream, bool seen_na = false; std::optional current_str{}; - // TODO: we can simplify this further if we just iterate on the array - // and not the array view, but there is an upstream bug in nanoarrow - // that prevents that - // https://github.com/apache/arrow-nanoarrow/issues/701 - nanoarrow::UniqueArrayView array_view{}; nanoarrow::UniqueSchema schema{}; NANOARROW_THROW_NOT_OK( ArrowArrayStreamGetSchema(array_stream, schema.get(), nullptr)); nanoarrow::ViewArrayStream array_stream_view(array_stream); for (const auto &array : array_stream_view) { - array_view.reset(); - NANOARROW_THROW_NOT_OK( - ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr)); - NANOARROW_THROW_NOT_OK( - ArrowArrayViewSetArray(array_view.get(), &array, nullptr)); - - for (const auto &sv : - nanoarrow::ViewArrayAsBytes(array_view.get())) { + for (const auto &sv : nanoarrow::ViewArrayAsBytes(&array)) { if ((!sv || seen_na) && !skipna) { seen_na = true; ArrowArrayAppendNull(out, 1); diff --git a/subprojects/nanoarrow.wrap b/subprojects/nanoarrow.wrap index 7b4ce8abdb762..bd98febad3911 100644 --- a/subprojects/nanoarrow.wrap +++ b/subprojects/nanoarrow.wrap @@ -1,10 +1,9 @@ [wrap-file] -directory = apache-arrow-nanoarrow-0.6.0 -source_url = https://www.apache.org/dyn/closer.lua?action=download&filename=arrow/apache-arrow-nanoarrow-0.6.0/apache-arrow-nanoarrow-0.6.0.tar.gz +directory = arrow-nanoarrow-7a808701819cb4c5f6b6ddf7c51c09389cd097ff +source_url = https://github.com/apache/arrow-nanoarrow/archive/7a808701819cb4c5f6b6ddf7c51c09389cd097ff.tar.gz source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/nanoarrow_0.6.0-1/apache-arrow-nanoarrow-0.6.0.tar.gz -source_filename = apache-arrow-nanoarrow-0.6.0.tar.gz -source_hash = e4a02ac51002ad1875bf09317e70adb959005fad52b240ff59f73b970fa485d1 -wrapdb_version = 0.6.0-1 +source_filename = arrow-nanoarrow-7a808701819cb4c5f6b6ddf7c51c09389cd097ff.tar.gz +source_hash = 1f4924dc341bc3bf357ee23320651f18c05a4e031e089b2bc09eeadee2664855 [provide] nanoarrow = nanoarrow_dep