Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support construction of a DataFrame from a Mapping #58814

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Other enhancements
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
- :class:`DataFrame` now supports construction from a :py:class:`collections.abc.Mapping` object (:issue:`58803`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@

if TYPE_CHECKING:
from collections.abc import (
Mapping,
Sequence,
Sized,
)
Expand Down Expand Up @@ -860,13 +861,13 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
return dtype, val


def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
def dict_compat(d: Mapping[Scalar, Scalar]) -> dict[Scalar, Scalar]:
"""
Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
Convert datetimelike-keyed Mappings to a Timestamp-keyed dict.

Parameters
----------
d: dict-like object
d: Mapping object

Returns
-------
Expand Down
20 changes: 10 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,12 +513,12 @@ class DataFrame(NDFrame, OpsMixin):

Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
Dict can contain Series, arrays, constants, dataclass or list-like objects. If
data is a dict, column order follows insertion-order. If a dict contains Series
which have an index defined, it is aligned by its index. This alignment also
occurs if data is a Series or a DataFrame itself. Alignment is done on
Series/DataFrame inputs.
data : ndarray (structured or homogeneous), Iterable, Mapping, or DataFrame
Mapping can contain Series, arrays, constants, dataclass or list-like objects.
If data is a Mapping, column order follows insertion-order. If a Mapping
contains Series which have an index defined, it is aligned by its index. This
alignment also occurs if data is a Series or a DataFrame itself. Alignment is
done on Series/DataFrame inputs.

If data is a list of dicts, column order follows insertion-order.

Expand Down Expand Up @@ -735,7 +735,7 @@ def __init__(
raise ValueError("columns cannot be a set")

if copy is None:
if isinstance(data, dict):
if isinstance(data, Mapping):
# retain pre-GH#38939 default behavior
copy = True
elif not isinstance(data, (Index, DataFrame, Series)):
Expand All @@ -754,7 +754,7 @@ def __init__(
data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
)

elif isinstance(data, dict):
elif isinstance(data, Mapping):
# GH#38939 de facto copy defaults to False only in non-dict cases
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy)
elif isinstance(data, ma.MaskedArray):
Expand Down Expand Up @@ -1735,7 +1735,7 @@ def __rmatmul__(self, other) -> DataFrame:
@classmethod
def from_dict(
cls,
data: dict,
data: Mapping,
orient: FromDictOrient = "columns",
dtype: Dtype | None = None,
columns: Axes | None = None,
Expand All @@ -1748,7 +1748,7 @@ def from_dict(

Parameters
----------
data : dict
data : Mapping
Of the form {field : array-like} or {field : dict}.
orient : {'columns', 'index', 'tight'}, default 'columns'
The "orientation" of the data. If the keys of the passed dict
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def _check_values_indices_shape_match(


def dict_to_mgr(
data: dict,
data: abc.Mapping,
index,
columns,
*,
Expand Down Expand Up @@ -536,7 +536,7 @@ def _homogenize(
refs.append(val._references)
val = val._values
else:
if isinstance(val, dict):
if isinstance(val, abc.Mapping):
# GH#41785 this _should_ be equivalent to (but faster than)
# val = Series(val, index=index)._values
if oindex is None:
Expand Down Expand Up @@ -578,7 +578,7 @@ def _extract_index(data) -> Index:
if isinstance(val, ABCSeries):
have_series = True
indexes.append(val.index)
elif isinstance(val, dict):
elif isinstance(val, abc.Mapping):
have_dicts = True
indexes.append(list(val.keys()))
elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/frame/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from __future__ import annotations

from collections.abc import (
Mapping,
Sequence,
)
from typing import TYPE_CHECKING

from pandas import (
Expand All @@ -11,6 +15,31 @@
from pandas._typing import AxisInt


class DictWrapper(Mapping):
    """
    Minimal read-only ``Mapping`` backed by a ``dict``.

    The wrapper is deliberately *not* a ``dict`` subclass, so it can be used
    to exercise code paths that must accept any ``collections.abc.Mapping``
    rather than only ``dict`` instances.

    Parameters
    ----------
    d : dict
        The dictionary to wrap. It is stored by reference, not copied.
    """

    def __init__(self, d: dict) -> None:
        self._dict = d

    def __getitem__(self, key):
        # Delegates to the wrapped dict; a missing key raises KeyError.
        return self._dict[key]

    def __iter__(self):
        return iter(self._dict)

    def __len__(self) -> int:
        return len(self._dict)

    def __repr__(self) -> str:
        # Readable output in assertion failures instead of the default
        # ``<DictWrapper object at 0x...>``.
        return f"{type(self).__name__}({self._dict!r})"


class ListWrapper(Sequence):
    """
    Minimal read-only ``Sequence`` backed by a ``list``.

    The wrapper is deliberately *not* a ``list`` subclass, so it can be used
    to exercise code paths that must accept any ``collections.abc.Sequence``
    rather than only ``list`` instances.

    Parameters
    ----------
    lst : list
        The list to wrap. It is stored by reference, not copied.
    """

    def __init__(self, lst: list) -> None:
        self._list = lst

    def __getitem__(self, i):
        # Delegates to the wrapped list, so integer indices and slices behave
        # exactly as they do on ``list`` (a slice returns a plain list).
        return self._list[i]

    def __len__(self) -> int:
        return len(self._list)

    def __repr__(self) -> str:
        # Readable output in assertion failures instead of the default
        # ``<ListWrapper object at 0x...>``.
        return f"{type(self).__name__}({self._list!r})"


def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = {"A": "float32", "B": "float32", "C": "float16", "D": "float64"}
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
MultiIndex,
RangeIndex,
Series,
date_range,
)
import pandas._testing as tm
from pandas.tests.frame.common import DictWrapper


class TestFromDict:
Expand Down Expand Up @@ -135,6 +137,27 @@ def test_constructor_from_ordered_dict(self):
result = DataFrame.from_dict(a, orient="index")
tm.assert_frame_equal(result, expected)

def test_constructor_from_mapping(self):
    # DataFrame.from_dict on a non-dict Mapping must give the same frame as
    # passing that Mapping straight to the DataFrame constructor.
    tz_index = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
    naive_range = date_range("20130110", periods=3)
    columns = {"A": tz_index, "B": naive_range}

    expected = DataFrame(DictWrapper(columns))
    result = DataFrame.from_dict(DictWrapper(columns))

    tm.assert_frame_equal(result, expected)

def test_constructor_from_mapping_of_mapping(self):
    # A Mapping whose values are themselves Mappings: from_dict and the plain
    # constructor must agree.
    inner = {
        "a": {"x": 1, "y": 2},
        "b": {"x": 3, "y": 4},
        "c": {"x": 5, "y": 6},
    }
    nested = DictWrapper(
        {label: DictWrapper(column) for label, column in inner.items()}
    )

    expected = DataFrame(nested)
    result = DataFrame.from_dict(nested)

    tm.assert_frame_equal(result, expected)

def test_from_dict_columns_parameter(self):
# GH#18529
# Test new columns parameter for from_dict that was added to make
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
SparseArray,
TimedeltaArray,
)
from pandas.tests.frame.common import (
DictWrapper,
ListWrapper,
)

MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
MIXED_INT_DTYPES = [
Expand Down Expand Up @@ -2917,6 +2921,72 @@ def test_from_dict(self):
tm.assert_series_equal(df["A"], Series(idx, name="A"))
tm.assert_series_equal(df["B"], Series(dr, name="B"))

def test_from_mapping(self):
    # Build a DataFrame from a non-dict Mapping of column label -> Index and
    # check that dtype, name and values of each column survive.
    idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
    dr = date_range("20130110", periods=3)

    # construction
    df = DataFrame(DictWrapper({"A": idx, "B": dr}))

    # Must be an explicit ``==`` comparison: ``assert dtype, "..."`` treats
    # the string as the failure message and only checks that the dtype
    # object is truthy, so it never compared against the expected dtype.
    assert df["A"].dtype == idx.dtype
    assert df["A"].name == "A"
    tm.assert_series_equal(df["A"], Series(idx, name="A"))
    tm.assert_series_equal(df["B"], Series(dr, name="B"))

def test_from_mapping_of_dict(self):
    # A Mapping whose values are plain dicts must build the same frame as
    # the equivalent dict of dicts.
    plain = dict(
        a={"x": 1, "y": 2},
        b={"x": 3, "y": 4},
        c={"x": 5, "y": 6},
    )
    expected = DataFrame(plain)

    # construction
    result = DataFrame(DictWrapper(plain))

    tm.assert_frame_equal(result, expected)

def test_from_mapping_of_mapping(self):
    # A Mapping whose values are Mappings must build the same frame as the
    # equivalent dict of dicts.
    plain = dict(
        a={"x": 1, "y": 2},
        b={"x": 3, "y": 4},
        c={"x": 5, "y": 6},
    )
    expected = DataFrame(plain)

    # construction: wrap every inner dict, then wrap the outer dict
    wrapped_columns = {}
    for label, column in plain.items():
        wrapped_columns[label] = DictWrapper(column)
    result = DataFrame(DictWrapper(wrapped_columns))

    tm.assert_frame_equal(result, expected)

def test_from_mapping_list(self):
    # Build a DataFrame from a list of non-dict Mapping records and check
    # that dtype, name and values of each column survive the round trip.
    idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
    dr = date_range("20130110", periods=3)
    data = DataFrame({"A": idx, "B": dr})
    mapping_list = [
        DictWrapper(record) for record in data.to_dict(orient="records")
    ]

    # construction
    df = DataFrame(mapping_list)

    # Must be an explicit ``==`` comparison: ``assert dtype, "..."`` treats
    # the string as the failure message and only checks that the dtype
    # object is truthy, so it never compared against the expected dtype.
    assert df["A"].dtype == idx.dtype
    assert df["A"].name == "A"
    tm.assert_series_equal(df["A"], Series(idx, name="A"))
    tm.assert_series_equal(df["B"], Series(dr, name="B"))

def test_from_mapping_sequence(self):
    # Build a DataFrame from a non-list Sequence of non-dict Mapping records
    # and check that dtype, name and values of each column survive.
    idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
    dr = date_range("20130110", periods=3)
    data = DataFrame({"A": idx, "B": dr})
    mapping_list = ListWrapper(
        [DictWrapper(record) for record in data.to_dict(orient="records")]
    )

    # construction
    df = DataFrame(mapping_list)

    # Must be an explicit ``==`` comparison: ``assert dtype, "..."`` treats
    # the string as the failure message and only checks that the dtype
    # object is truthy, so it never compared against the expected dtype.
    assert df["A"].dtype == idx.dtype
    assert df["A"].name == "A"
    tm.assert_series_equal(df["A"], Series(idx, name="A"))
    tm.assert_series_equal(df["B"], Series(dr, name="B"))

def test_from_index(self):
# from index
idx2 = date_range("20130101", periods=3, tz="US/Eastern", name="foo")
Expand Down
Loading