Construction of Series from dict containing NaN as key (pandas-dev#18496)

toobaz · jreback · commit d270bbb1448e · 2017-12-01T14:02:41.000-05:00
closes pandas-dev#18480 closes pandas-dev#18515
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -99,6 +99,7 @@ Other API Changes
 
 - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`)
 - ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`)
+- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistent with the case in which an empty list is passed (:issue:`18515`)
 - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
 - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
 - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
@@ -242,5 +243,6 @@ Other
 
 - Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`)
 - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
+- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as a key (:issue:`18480`)
 - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
 -
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -874,9 +874,8 @@ def _map_values(self, mapper, na_action=None):
                 # convert to an Series for efficiency.
                 # we specify the keys here to handle the
                 # possibility that they are tuples
-                from pandas import Series, Index
-                index = Index(mapper, tupleize_cols=False)
-                mapper = Series(mapper, index=index)
+                from pandas import Series
+                mapper = Series(mapper)
 
         if isinstance(mapper, ABCSeries):
             # Since values were input this means we came from either
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2822,27 +2822,6 @@ def get_indexer_for(self, target, **kwargs):
         indexer, _ = self.get_indexer_non_unique(target, **kwargs)
         return indexer
 
-    _index_shared_docs['_get_values_from_dict'] = """
-        Return the values of the input dictionary in the order the keys are
-        in the index. np.nan is returned for index values not in the
-        dictionary.
-
-        Parameters
-        ----------
-        data : dict
-            The dictionary from which to extract the values
-
-        Returns
-        -------
-        np.array
-
-        """
-
-    @Appender(_index_shared_docs['_get_values_from_dict'])
-    def _get_values_from_dict(self, data):
-        return lib.fast_multiget(data, self.values,
-                                 default=np.nan)
-
     def _maybe_promote(self, other):
         # A hack, but it works
         from pandas.core.indexes.datetimes import DatetimeIndex
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
@@ -700,14 +700,6 @@ def __rsub__(self, other):
     def _add_delta(self, other):
         return NotImplemented
 
-    @Appender(_index_shared_docs['_get_values_from_dict'])
-    def _get_values_from_dict(self, data):
-        if len(data):
-            return np.array([data.get(i, np.nan)
-                             for i in self.asobject.values])
-
-        return np.array([np.nan])
-
     def _add_delta_td(self, other):
         # add a delta of a timedeltalike
         # return the i8 result view
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -1457,17 +1457,6 @@ def get_value_maybe_box(self, series, key):
                                         key, tz=self.tz)
         return _maybe_box(self, values, series, key)
 
-    @Appender(_index_shared_docs['_get_values_from_dict'])
-    def _get_values_from_dict(self, data):
-        if len(data):
-            # coerce back to datetime objects for lookup
-            data = com._dict_compat(data)
-            return lib.fast_multiget(data,
-                                     self.asobject.values,
-                                     default=np.nan)
-
-        return np.array([np.nan])
-
     def get_loc(self, key, method=None, tolerance=None):
         """
         Get integer location for requested label
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -42,7 +42,6 @@
                                 _default_index,
                                 _asarray_tuplesafe,
                                 _values_from_object,
-                                _try_sort,
                                 _maybe_match_name,
                                 SettingWithCopyError,
                                 _maybe_box_datetimelike,
@@ -198,18 +197,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                     data = data.reindex(index, copy=copy)
                 data = data._data
             elif isinstance(data, dict):
-                if index is None:
-                    if isinstance(data, OrderedDict):
-                        index = Index(data)
-                    else:
-                        index = Index(_try_sort(data))
-
-                try:
-                    data = index._get_values_from_dict(data)
-                except TypeError:
-                    data = ([data.get(i, np.nan) for i in index]
-                            if data else np.nan)
-
+                data, index = self._init_dict(data, index, dtype)
+                dtype = None
+                copy = False
             elif isinstance(data, SingleBlockManager):
                 if index is None:
                     index = data.index
@@ -257,6 +247,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
         self.name = name
         self._set_axis(0, index, fastpath=True)
 
+    def _init_dict(self, data, index=None, dtype=None):
+        """
+        Derive the "_data" and "index" attributes of a new Series from a
+        dictionary input, pairing keys with values by iterating its items.
+
+        Parameters
+        ----------
+        data : dict or dict-like
+            Data used to populate the new Series
+        index : Index or index-like, default None
+            index for the new Series: if None, use dict keys
+        dtype : dtype, default None
+            dtype for the new Series: if None, infer from data
+
+        Returns
+        -------
+        _data : BlockManager for the new Series
+        index : index for the new Series
+        """
+        # Key lookup cannot find NaN ({np.nan : 1}[float('nan')] raises
+        # KeyError), so unzip all items instead of fetching values per key
+        if data:
+            keys, values = zip(*compat.iteritems(data))
+        else:
+            keys, values = [], []
+
+        # keys/values are now list-likes: reuse the "standard" construction
+        s = Series(values, index=keys, dtype=dtype)
+
+        # Order: explicit index wins; OrderedDict keeps its order; else sort
+        if index is not None:
+            s = s.reindex(index, copy=False)
+        elif not isinstance(data, OrderedDict):
+            try:
+                s = s.sort_index()
+            except TypeError:  # sorting failed: keep the iteration order
+                pass
+        return s._data, s.index
+
     @classmethod
     def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
                    fastpath=False):
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -422,6 +422,7 @@ def test_map_dict_with_tuple_keys(self):
         converted to a multi-index, preventing tuple values
         from being mapped properly.
         """
+        # GH 18496
         df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]})
         label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'}
 
diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py
@@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self):
         # categorical
         assert pd.concat([Series(dtype='category'),
                           Series(dtype='category')]).dtype == 'category'
-        assert pd.concat([Series(dtype='category'),
+        # GH 18515
+        assert pd.concat([Series(np.array([]), dtype='category'),
                           Series(dtype='float64')]).dtype == 'float64'
         assert pd.concat([Series(dtype='category'),
                           Series(dtype='object')]).dtype == 'object'
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -4,6 +4,7 @@
 import pytest
 
 from datetime import datetime, timedelta
+from collections import OrderedDict
 
 from numpy import nan
 import numpy as np
@@ -79,17 +80,42 @@ def test_constructor(self):
         m = MultiIndex.from_arrays([[1, 2], [3, 4]])
         pytest.raises(NotImplementedError, Series, m)
 
-    def test_constructor_empty(self):
+    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
+    def test_constructor_empty(self, input_class):
         empty = Series()
-        empty2 = Series([])
+        empty2 = Series(input_class())
 
-        # the are Index() and RangeIndex() which don't compare type equal
+        # these are Index() and RangeIndex() which don't compare type equal
         # but are just .equals
         assert_series_equal(empty, empty2, check_index_type=False)
 
-        empty = Series(index=lrange(10))
-        empty2 = Series(np.nan, index=lrange(10))
-        assert_series_equal(empty, empty2)
+        # With explicit dtype:
+        empty = Series(dtype='float64')
+        empty2 = Series(input_class(), dtype='float64')
+        assert_series_equal(empty, empty2, check_index_type=False)
+
+        # GH 18515 : with dtype=category:
+        empty = Series(dtype='category')
+        empty2 = Series(input_class(), dtype='category')
+        assert_series_equal(empty, empty2, check_index_type=False)
+
+        if input_class is not list:
+            # With index:
+            empty = Series(index=lrange(10))
+            empty2 = Series(input_class(), index=lrange(10))
+            assert_series_equal(empty, empty2)
+
+            # With index and dtype float64:
+            empty = Series(np.nan, index=lrange(10))
+            empty2 = Series(input_class(), index=lrange(10), dtype='float64')
+            assert_series_equal(empty, empty2)
+
+    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
+    def test_constructor_nan(self, input_arg):
+        empty = Series(dtype='float64', index=lrange(10))
+        empty2 = Series(input_arg, index=lrange(10))
+        # A scalar NaN must build the same Series as passing no data at all
+        assert_series_equal(empty, empty2, check_index_type=False)
 
     def test_constructor_series(self):
         index1 = ['d', 'b', 'a', 'c']
@@ -625,6 +651,21 @@ def test_constructor_dict(self):
         expected.iloc[1] = 1
         assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18480: dict keys that are NaN/None must be kept as index labels
+        d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
+        result = Series(d).sort_values()
+        expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
+        assert_series_equal(result, expected)
+
+        # MultiIndex: same check with tuple keys that contain NaN/None
+        d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
+        result = Series(d).sort_values()
+        expected = Series(['a', 'b', 'c'],
+                          index=Index([(1, 1), (2, np.nan), (3, value)]))
+        assert_series_equal(result, expected)
+
     def test_constructor_dict_datetime64_index(self):
         # GH 9456
 
@@ -658,8 +699,6 @@ def test_constructor_tuple_of_tuples(self):
         s = Series(data)
         assert tuple(s) == data
 
-    @pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with '
-                              'NaN keys')
     def test_constructor_dict_of_tuples(self):
         data = {(1, 2): 3,
                 (None, 5): 6}