Commit 250b19c

TomNicholas authored and shoyer committed
Source encoding always set when opening datasets (#2626)
* Add source encoding if not already present when opening dataset
* Test source encoding present
* Updated what's new
* Revert "Updated what's new" (this reverts commit 7858799)
* Don't close file-like objects
* Updated what's new
* DOC: document source encoding for datasets
1 parent 1545b50 commit 250b19c

File tree

4 files changed: +41 -11

  doc/io.rst
  doc/whats-new.rst
  xarray/backends/api.py
  xarray/tests/test_backends.py

doc/io.rst

Lines changed: 15 additions & 9 deletions
@@ -197,24 +197,30 @@ turn this decoding off manually.
 .. _CF conventions: http://cfconventions.org/
 
 You can view this encoding information (among others) in the
-:py:attr:`DataArray.encoding <xarray.DataArray.encoding>` attribute:
+:py:attr:`DataArray.encoding <xarray.DataArray.encoding>` and
+:py:attr:`Dataset.encoding <xarray.Dataset.encoding>` attributes:
 
 .. ipython::
     :verbatim:
 
     In [1]: ds_disk['y'].encoding
     Out[1]:
-    {'calendar': u'proleptic_gregorian',
-     'chunksizes': None,
+    {'zlib': False,
+     'shuffle': False,
      'complevel': 0,
-     'contiguous': True,
-     'dtype': dtype('float64'),
      'fletcher32': False,
-     'least_significant_digit': None,
-     'shuffle': False,
+     'contiguous': True,
+     'chunksizes': None,
      'source': 'saved_on_disk.nc',
-     'units': u'days since 2000-01-01 00:00:00',
-     'zlib': False}
+     'original_shape': (5,),
+     'dtype': dtype('int64'),
+     'units': 'days since 2000-01-01 00:00:00',
+     'calendar': 'proleptic_gregorian'}
+
+    In [9]: ds_disk.encoding
+    Out[9]:
+    {'unlimited_dims': set(),
+     'source': 'saved_on_disk.nc'}
 
 Note that all operations that manipulate variables other than indexing
 will remove encoding information.
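
For context, the documented behaviour can be reproduced end-to-end with a
short sketch like the following (assuming a netCDF backend such as netCDF4
or scipy is installed; the exact encoding keys vary by backend and dtype):

    import numpy as np
    import xarray as xr

    # Write a small dataset so the backend populates encoding on read.
    ds = xr.Dataset({'y': ('x', np.arange(5))})
    ds.to_netcdf('saved_on_disk.nc')

    with xr.open_dataset('saved_on_disk.nc') as ds_disk:
        # Per-variable encoding: compression settings, dtype, 'source', ...
        print(ds_disk['y'].encoding)
        # Dataset-level encoding: after this commit it always includes
        # 'source' when the dataset was opened from a filename.
        print(ds_disk.encoding)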

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
@@ -67,6 +67,9 @@ Enhancements
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` now supports the
   ``loffset`` kwarg just like Pandas.
   By `Deepak Cherian <https://github.com/dcherian>`_
+- Datasets are now guaranteed to have a ``'source'`` encoding, so the source
+  file name is always stored (:issue:`2550`).
+  By `Tom Nicholas <http://github.com/TomNicholas>`_.
 - The `apply` methods for `DatasetGroupBy`, `DataArrayGroupBy`,
   `DatasetResample` and `DataArrayResample` can now pass positional arguments to
   the applied function.

xarray/backends/api.py

Lines changed: 12 additions & 2 deletions
@@ -299,6 +299,7 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
+        ds = maybe_decode_store(store)
     elif isinstance(filename_or_obj, basestring):
 
         if (isinstance(filename_or_obj, bytes) and
@@ -339,15 +340,21 @@ def maybe_decode_store(store, lock=False):
                              % engine)
 
         with close_on_error(store):
-            return maybe_decode_store(store)
+            ds = maybe_decode_store(store)
     else:
         if engine is not None and engine != 'scipy':
             raise ValueError('can only read file-like objects with '
                              "default engine or engine='scipy'")
         # assume filename_or_obj is a file-like object
         store = backends.ScipyDataStore(filename_or_obj)
+        ds = maybe_decode_store(store)
 
-    return maybe_decode_store(store)
+    # Ensure source filename always stored in dataset object (GH issue #2550)
+    if 'source' not in ds.encoding:
+        if isinstance(filename_or_obj, basestring):
+            ds.encoding['source'] = filename_or_obj
+
+    return ds
 
 
 def open_dataarray(filename_or_obj, group=None, decode_cf=True,
@@ -484,6 +491,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    lock=None, data_vars='all', coords='different',
                    autoclose=None, parallel=False, **kwargs):
     """Open multiple files as a single dataset.
+
     Requires dask to be installed. See documentation for details on dask [1].
     Attributes from the first dataset file are used for the combined dataset.
 
@@ -523,6 +531,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         of all non-null values.
     preprocess : callable, optional
         If provided, call this function on each dataset prior to concatenation.
+        You can find the file-name from which each dataset was loaded in
+        ``ds.encoding['source']``.
     engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib'},
         optional
         Engine to use when reading files. If not provided, the default engine
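
The docstring addition above points at the main use case: reading the
guaranteed 'source' encoding inside a preprocess callback. A minimal sketch
(the glob pattern and callback name are placeholders):

    import xarray as xr

    def log_source(ds):
        # Each component dataset records the filename it was loaded from.
        print('loading', ds.encoding['source'])
        return ds

    combined = xr.open_mfdataset('data/*.nc', preprocess=log_source)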

xarray/tests/test_backends.py

Lines changed: 11 additions & 0 deletions
@@ -3426,3 +3426,14 @@ def test_no_warning_from_dask_effective_get():
         ds = Dataset()
         ds.to_netcdf(tmpfile)
     assert len(record) == 0
+
+
+@requires_scipy_or_netCDF4
+def test_source_encoding_always_present():
+    # Test for GH issue #2550.
+    rnddata = np.random.randn(10)
+    original = Dataset({'foo': ('x', rnddata)})
+    with create_tmp_file() as tmp:
+        original.to_netcdf(tmp)
+        with open_dataset(tmp) as ds:
+            assert ds.encoding['source'] == tmp
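
Note that the new guard in api.py only assigns 'source' when filename_or_obj
is a string, so a dataset opened from a file-like object may still have no
'source' entry. A sketch of both cases (assuming scipy is available for the
in-memory round trip):

    import io
    import numpy as np
    import xarray as xr

    original = xr.Dataset({'foo': ('x', np.random.randn(10))})

    # Opened by filename: 'source' is guaranteed after this commit.
    original.to_netcdf('tmp.nc')
    with xr.open_dataset('tmp.nc') as ds:
        assert ds.encoding['source'] == 'tmp.nc'

    # Opened from a file-like object: the guard does not apply,
    # so 'source' is likely absent here.
    buf = io.BytesIO(original.to_netcdf())
    with xr.open_dataset(buf) as ds:
        print(ds.encoding.get('source'))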
