Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c8ba339
add targeted casting to combine_first
angela-tarantula Oct 14, 2025
f786763
use difference not union to avoid sorting
angela-tarantula Oct 15, 2025
ae99b3d
fix typo, fewer comments
angela-tarantula Oct 15, 2025
6e77fc9
refactor
angela-tarantula Oct 15, 2025
ea7d2ba
add news
angela-tarantula Oct 15, 2025
301d85f
always use nullable Int for 64-bit ints
angela-tarantula Oct 15, 2025
a6b461c
always upcast, for predictability
angela-tarantula Oct 19, 2025
e15bde9
make wide ints nullable before align and restore after combining
angela-tarantula Oct 19, 2025
451621b
update test expectations (don't convert to float64 when Int64 or UInt…
angela-tarantula Oct 19, 2025
4fdc459
add type hint
angela-tarantula Oct 19, 2025
ef662a0
small refactor
angela-tarantula Oct 19, 2025
fefadcb
combine_first's combiner must preserve EA dtypes
angela-tarantula Oct 19, 2025
f80917d
clearer comments
angela-tarantula Oct 19, 2025
bf69fad
create new test for issue
angela-tarantula Oct 19, 2025
016c64e
clean up test
angela-tarantula Oct 19, 2025
2928cee
don't break any other tests, but comment why it may be worth it
angela-tarantula Oct 20, 2025
1a53d48
preserve old test
angela-tarantula Oct 20, 2025
7e6837f
thinner comment
angela-tarantula Oct 20, 2025
444deaa
clearer comments
angela-tarantula Oct 21, 2025
49ff1a5
follow contributing guidelines
angela-tarantula Oct 22, 2025
747e8bc
move news from reshaping to numeric
angela-tarantula Oct 22, 2025
c527bc0
use correct typing
angela-tarantula Oct 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,7 @@ Numeric
^^^^^^^
- Bug in :func:`api.types.infer_dtype` returning "mixed" for complex and ``pd.NA`` mix (:issue:`61976`)
- Bug in :func:`api.types.infer_dtype` returning "mixed-integer-float" for float and ``pd.NA`` mix (:issue:`61621`)
- Bug in :meth:`DataFrame.combine` and :meth:`DataFrame.combine_first` where integers with absolute value greater than ``2**53`` could lose precision after the operation (:issue:`60128`)
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
Expand Down
82 changes: 70 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.integer import (
Int64Dtype,
UInt64Dtype,
)
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import (
Expand Down Expand Up @@ -9025,6 +9029,67 @@ def combine(
1 0.0 3.0 1.0
2 NaN 3.0 1.0
"""

# GH#60128 Integers n where |n| > 2**53 would lose precision after align
# upcasts them to float. Avoid lossy conversion by preemptively promoting
# int64 and uint64 to their nullable ExtensionDtypes, Int64 and UInt64.
def _ensure_nullable_int64_dtypes(df: DataFrame) -> DataFrame:
    """
    Promote int64/uint64 DataFrame columns to Int64/UInt64.

    Parameters
    ----------
    df : DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    DataFrame
        ``df`` itself when it has no int64/uint64 column; otherwise the
        result of ``df.astype`` with every int64 column cast to
        ``Int64Dtype()`` and every uint64 column cast to ``UInt64Dtype()``.
        Columns of any other dtype are not included in the cast.
    """
    cast_map: dict[IndexLabel, DtypeObj] = {}
    for col, dt in df.dtypes.items():
        if dt == np.int64:
            cast_map[col] = Int64Dtype()
        elif dt == np.uint64:
            cast_map[col] = UInt64Dtype()

    # Nothing to promote: hand back the caller's object without calling astype.
    if not cast_map:
        return df
    return df.astype(cast_map)

# To maintain backwards compatibility, downcast the pre-promoted int64
# columns of the combined DataFrame back to how they would have resolved.
# Consider just embracing nullable ExtensionDtypes instead, though.
def _revert_int64_dtype_promotion(
    self_orig: DataFrame, other_orig: DataFrame, combined_df: DataFrame
) -> DataFrame:
    """
    Resolve the combined dtypes according to the original dtypes.

    Parameters
    ----------
    self_orig, other_orig : DataFrame
        The two input frames as they were before any int64/uint64 column
        was promoted to a nullable dtype. Only their ``dtypes`` are read.
    combined_df : DataFrame
        The combined result whose columns may need to be cast back.

    Returns
    -------
    DataFrame
        ``combined_df`` itself when no column needs a cast; otherwise the
        result of ``combined_df.astype`` for the affected columns.

    Notes
    -----
    A column is cast only when it was int64 or uint64 in at least one of
    the original frames. Its target dtype is ``find_common_type`` of the
    original dtypes that exist for that label, with float64 added to the
    candidates when the combined column contains any missing value.
    """
    wide_ints = (np.int64, np.uint64)
    cast_map: dict[IndexLabel, DtypeObj] = {}

    for col in combined_df.columns:
        # A label missing from one of the originals yields None; drop it.
        orig_dtypes = [
            dt
            for dt in (self_orig.dtypes.get(col), other_orig.dtypes.get(col))
            if dt is not None
        ]

        # Only columns that were int64/uint64 in an input were promoted.
        # A promoted column always has at least one original dtype, so
        # orig_dtypes is never empty past this point.
        if not any(dt in wide_ints for dt in orig_dtypes):
            continue

        if isna(combined_df[col]).any():
            # If there are NAs, we can't safely downcast back to int.
            # Previously, the data was left as float64. Converting large
            # integers to float can lose precision, so consider embracing
            # nullable ExtensionDtypes instead and dropping this whole
            # restoration step.
            orig_dtypes.append(np.dtype(np.float64))

        cast_map[col] = find_common_type(orig_dtypes)

    if not cast_map:
        return combined_df
    return combined_df.astype(cast_map)

# store originals and prepare for align
self_orig = self
other_orig = other
self = _ensure_nullable_int64_dtypes(self)
other = _ensure_nullable_int64_dtypes(other)

other_idxlen = len(other.index) # save for compare
other_columns = other.columns

Expand Down Expand Up @@ -9092,6 +9157,9 @@ def combine(

# convert_objects just in case
frame_result = self._constructor(result, index=new_index, columns=new_columns)
frame_result = _revert_int64_dtype_promotion(
self_orig, other_orig, frame_result
)
return frame_result.__finalize__(self, method="combine")

def combine_first(self, other: DataFrame) -> DataFrame:
Expand Down Expand Up @@ -9141,20 +9209,10 @@ def combine_first(self, other: DataFrame) -> DataFrame:
1 0.0 3.0 1.0
2 NaN 3.0 1.0
"""
from pandas.core.computation import expressions

def combiner(x: Series, y: Series):
# Combine two columns: take y where x is missing, otherwise keep x.
# A column y whose name is not in self.columns is returned on its own.
# NOTE(review): this span looks like a diff hunk that interleaves the
# removed array-based implementation (mask / expressions.where) with its
# replacement (the final Series-based return). As written, the final
# return is unreachable -- confirm against the real file which side
# should remain.
mask = x.isna()._values

x_values = x._values
y_values = y._values

# If the column y in other DataFrame is not in first DataFrame,
# just return y_values.
if y.name not in self.columns:
return y_values

return expressions.where(mask, y_values, x_values)
# GH#60128 The combiner is supposed to preserve EA Dtypes.
return y if y.name not in self.columns else y.where(x.isna(), x)

if len(other) == 0:
combined = self.reindex(
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,21 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
).set_index(["a", "b"])
tm.assert_frame_equal(result, expected)

# Both parametrized magnitudes are far above 2**53, the range named in the
# GH#60128 changelog entry as the point where precision could be lost.
@pytest.mark.parametrize(
"wide_val, dtype",
(
(1666880195890293744, "uint64"),
(-1666880195890293744, "int64"),
),
)
def test_combine_first_preserve_precision(self, wide_val, dtype):
# GH#60128
# df1 has two rows and df2 has three, so the expected result keeps df1's
# two values and takes the third row from df2.
df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype)
df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype)
result = df1.combine_first(df2)
# The expected frame uses the same integer dtype as the inputs, so both
# the wide values and the dtype must match exactly.
expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"scalar1, scalar2",
Expand Down
Loading