handle NUL, fix version dtype in read_crn (#1026)

wholmgren · kandersolar · web-flow · commit 646c8fd22054 · 2020-08-25T10:01:50.000-07:00
* handle NUL char, fix version dtype in CRN parser

* what's new

* add test data file

* fix blank lines

* Update pvlib/tests/iotools/test_crn.py

Co-authored-by: Kevin Anderson <57452607+kanderso-nrel@users.noreply.github.com>

* add dropped line warning

* a few comments

Co-authored-by: Kevin Anderson <57452607+kanderso-nrel@users.noreply.github.com>
diff --git a/docs/sphinx/source/whatsnew/v0.8.0.rst b/docs/sphinx/source/whatsnew/v0.8.0.rst
@@ -48,6 +48,8 @@ Enhancements
 Bug fixes
 ~~~~~~~~~
 * Fixed unit and default value errors in :py:func:`pvlib.soiling.hsu`. (:pull:`XXX`)
+* Handle NUL characters and fix version column dtype in
+  :py:func:`~pvlib.iotools.crn.read_crn`. (:issue:`1025`)
 
 Testing
 ~~~~~~~
diff --git a/pvlib/data/CRN_with_problems.txt b/pvlib/data/CRN_with_problems.txt
diff --git a/pvlib/iotools/crn.py b/pvlib/iotools/crn.py
@@ -33,7 +33,7 @@
 
 # specify dtypes for potentially problematic values
 DTYPES = [
-    'int64', 'int64', 'int64', 'int64', 'int64', 'int64', 'float64', 'float64',
+    'int64', 'int64', 'int64', 'int64', 'int64', 'str', 'float64', 'float64',
     'float64', 'float64', 'float64', 'int64', 'float64', 'O', 'int64',
     'float64', 'int64', 'float64', 'float64', 'int64', 'int64', 'float64',
     'int64'
@@ -67,6 +67,13 @@ def read_crn(filename):
     e.g. `SOLAR_RADIATION` becomes `ghi`. See the
     `pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
 
+    CRN files occasionally have a set of null characters on a line
+    instead of valid data. This function drops those lines. Sometimes
+    these null characters appear on a line of their own and sometimes
+    they occur on the same line as valid data. In the latter case, the
+    valid data will not be returned. Users may manually remove the null
+    characters and reparse the file if they need that line.
+
     References
     ----------
     .. [1] U.S. Climate Reference Network
@@ -78,9 +85,13 @@ def read_crn(filename):
        Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
     """
 
-    # read in data
+    # read in data. set fields with NUL characters to NaN
     data = pd.read_fwf(filename, header=None, names=HEADERS.split(' '),
-                       widths=WIDTHS)
+                       widths=WIDTHS, na_values=['\x00\x00\x00\x00\x00\x00'])
+    # at this point we only have NaNs from NUL characters, not -999 etc.
+    # these bad rows need to be removed so that dtypes can be set.
+    # NaNs require float dtype so we run into errors if we don't do this.
+    data = data.dropna(axis=0)
     # loop here because dtype kwarg not supported in read_fwf until 0.20
     for (col, _dtype) in zip(data.columns, DTYPES):
         data[col] = data[col].astype(_dtype)
@@ -98,8 +109,11 @@ def read_crn(filename):
     except TypeError:
         pass
 
-    # set nans
+    # Now we can set nans. This could be done on a per-column basis to be
+    # safer, since in principle a real -99 value could occur in a -9999
+    # column. Very unlikely to see that in the real world.
     for val in [-99, -999, -9999]:
+        # consider replacing with .replace([-99, -999, -9999])
         data = data.where(data != val, np.nan)
 
     data = data.rename(columns=VARIABLE_MAP)
diff --git a/pvlib/tests/iotools/test_crn.py b/pvlib/tests/iotools/test_crn.py
@@ -8,18 +8,39 @@
 
 
 @pytest.fixture
-def testfile():
-    return DATA_DIR / 'CRNS0101-05-2019-AZ_Tucson_11_W.txt'
-
-
-def test_read_crn(testfile):
-    columns = [
+def columns():
+    return [
         'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
         'longitude', 'latitude', 'temp_air', 'PRECIPITATION', 'ghi',
         'ghi_flag',
         'SURFACE_TEMPERATURE', 'ST_TYPE', 'ST_FLAG', 'relative_humidity',
         'relative_humidity_flag', 'SOIL_MOISTURE_5', 'SOIL_TEMPERATURE_5',
         'WETNESS', 'WET_FLAG', 'wind_speed', 'wind_speed_flag']
+
+
+@pytest.fixture
+def dtypes():
+    return [
+        dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
+        dtype('int64'), dtype('O'), dtype('float64'), dtype('float64'),
+        dtype('float64'), dtype('float64'), dtype('float64'),
+        dtype('int64'), dtype('float64'), dtype('O'), dtype('int64'),
+        dtype('float64'), dtype('int64'), dtype('float64'),
+        dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
+        dtype('int64')]
+
+
+@pytest.fixture
+def testfile():
+    return DATA_DIR / 'CRNS0101-05-2019-AZ_Tucson_11_W.txt'
+
+
+@pytest.fixture
+def testfile_problems():
+    return DATA_DIR / 'CRN_with_problems.txt'
+
+
+def test_read_crn(testfile, columns, dtypes):
     index = pd.DatetimeIndex(['2019-01-01 16:10:00',
                               '2019-01-01 16:15:00',
                               '2019-01-01 16:20:00',
@@ -34,16 +55,26 @@ def test_read_crn(testfile):
          0.0, 340.0, 0, 4.3, 'C', 0, 83.0, 0, nan, nan, 1183, 0, 0.53, 0],
         [53131, 20190101, 1625, 20190101, 925, 3, -111.17, 32.24, 4.0,
          0.0, 393.0, 0, 4.8, 'C', 0, 81.0, 0, nan, nan, 1223, 0, 0.64, 0]])
-    dtypes = [
-        dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
-        dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'),
-        dtype('float64'), dtype('float64'), dtype('float64'),
-        dtype('int64'), dtype('float64'), dtype('O'), dtype('int64'),
-        dtype('float64'), dtype('int64'), dtype('float64'),
-        dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
-        dtype('int64')]
     expected = pd.DataFrame(values, columns=columns, index=index)
     for (col, _dtype) in zip(expected.columns, dtypes):
         expected[col] = expected[col].astype(_dtype)
     out = crn.read_crn(testfile)
     assert_frame_equal(out, expected)
+
+
+def test_read_crn_problems(testfile_problems, columns, dtypes):
+    # GH1025
+    index = pd.DatetimeIndex(['2020-07-06 12:00:00',
+                              '2020-07-06 13:10:00'],
+                             freq=None).tz_localize('UTC')
+    values = np.array([
+        [92821, 20200706, 1200, 20200706, 700, '3', -80.69, 28.62, 24.9,
+         0.0, 190.0, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
+        [92821, 20200706, 1310, 20200706, 810, '2.623', -80.69, 28.62,
+         26.9, 0.0, 430.0, 0, 30.2, 'C', 0, 87.0, 0, nan, nan, 989, 0,
+         1.64, 0]])
+    expected = pd.DataFrame(values, columns=columns, index=index)
+    for (col, _dtype) in zip(expected.columns, dtypes):
+        expected[col] = expected[col].astype(_dtype)
+    out = crn.read_crn(testfile_problems)
+    assert_frame_equal(out, expected)