From 42fd7e7d9a2c115af9a52f7f896d48b75a271efe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jul 2020 18:15:55 +0200 Subject: [PATCH] ENH: concat of nullable int + bool preserves int dtype (#34985) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/integer.py | 9 +++-- pandas/tests/arrays/integer/test_concat.py | 45 +++++++++++++++++++++- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 46e0d2a1164e1..24283d2c2e48d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -322,7 +322,7 @@ Other enhancements - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer - result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). + result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). - :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index df43b5d6115ba..7be7ef3637ee5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -92,10 +92,13 @@ def construct_array_type(cls) -> Type["IntegerArray"]: return IntegerArray def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: - # for now only handle other integer types + # we only handle nullable EA dtypes and numeric numpy dtypes if not all( - isinstance(t, _IntegerDtype) - or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer)) + isinstance(t, BaseMaskedDtype) + or ( + isinstance(t, np.dtype) + and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_)) + ) for t in dtypes ): return None diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py index 3ace35700bd3e..fc24709deb82c 100644 --- a/pandas/tests/arrays/integer/test_concat.py +++ b/pandas/tests/arrays/integer/test_concat.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -15,12 +16,52 @@ (["Int32", "UInt32"], "Int64"), # this still gives object (awaiting float extension dtype) (["Int64", "UInt64"], "object"), + (["Int64", "boolean"], "Int64"), + (["UInt8", "boolean"], "UInt8"), ], ) def test_concat_series(to_concat_dtypes, result_dtype): - result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) - expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( result_dtype ) tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat( + [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]] + ) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Int64", "int64"], "Int64"), + (["UInt64", "uint64"], "UInt64"), + (["Int8", "int8"], "Int8"), + (["Int8", "int16"], "Int16"), + (["UInt8", "int8"], "Int16"), + (["Int32", "uint32"], "Int64"), + # this still gives object (awaiting float extension dtype) + (["Int64", "uint64"], "object"), + (["Int64", "bool"], "Int64"), + (["UInt8", "bool"], "UInt8"), + ], +) +def test_concat_series_with_numpy(to_concat_dtypes, result_dtype): + + s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0]) + s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1])) + result = pd.concat([s1, s2], ignore_index=True) + expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat([s2, s1], ignore_index=True) + expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected)