Skip to content

Commit 646c8fd

Browse files
handle NUL, fix version dtype in read_crn (#1026)
* handle NUL char, fix version dtype in CRN parser * whats new * add test data file * fix blank lines * Update pvlib/tests/iotools/test_crn.py Co-authored-by: Kevin Anderson <[email protected]> * add dropped line warning * a few comments Co-authored-by: Kevin Anderson <[email protected]>
1 parent 27872b8 commit 646c8fd

File tree

4 files changed

+65
-18
lines changed

4 files changed

+65
-18
lines changed

docs/sphinx/source/whatsnew/v0.8.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ Enhancements
4848
Bug fixes
4949
~~~~~~~~~
5050
* Fixed unit and default value errors in :py:func:`pvlib.soiling.hsu`. (:pull:`XXX`)
51+
* Handle NUL characters and fix version column dtype in
52+
:py:func:`~pvlib.iotools.crn.read_crn`. (:issue:`1025`)
5153

5254
Testing
5355
~~~~~~~

pvlib/data/CRN_with_problems.txt

1.98 KB
Binary file not shown.

pvlib/iotools/crn.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
# specify dtypes for potentially problematic values
3535
DTYPES = [
36-
'int64', 'int64', 'int64', 'int64', 'int64', 'int64', 'float64', 'float64',
36+
'int64', 'int64', 'int64', 'int64', 'int64', 'str', 'float64', 'float64',
3737
'float64', 'float64', 'float64', 'int64', 'float64', 'O', 'int64',
3838
'float64', 'int64', 'float64', 'float64', 'int64', 'int64', 'float64',
3939
'int64'
@@ -67,6 +67,13 @@ def read_crn(filename):
6767
e.g. `SOLAR_RADIATION` becomes `ghi`. See the
6868
`pvlib.iotools.crn.VARIABLE_MAP` dict for the complete mapping.
6969
70+
CRN files occasionally have a set of null characters on a line
71+
instead of valid data. This function drops those lines. Sometimes
72+
these null characters appear on a line of their own and sometimes
73+
they occur on the same line as valid data. In the latter case, the
74+
valid data will not be returned. Users may manually remove the null
75+
characters and reparse the file if they need that line.
76+
7077
References
7178
----------
7279
.. [1] U.S. Climate Reference Network
@@ -78,9 +85,13 @@ def read_crn(filename):
7885
Amer. Meteor. Soc., 94, 489-498. :doi:`10.1175/BAMS-D-12-00170.1`
7986
"""
8087

81-
# read in data
88+
# read in data. set fields with NUL characters to NaN
8289
data = pd.read_fwf(filename, header=None, names=HEADERS.split(' '),
83-
widths=WIDTHS)
90+
widths=WIDTHS, na_values=['\x00\x00\x00\x00\x00\x00'])
91+
# at this point we only have NaNs from NUL characters, not -999 etc.
92+
# these bad rows need to be removed so that dtypes can be set.
93+
# NaNs require float dtype so we run into errors if we don't do this.
94+
data = data.dropna(axis=0)
8495
# loop here because dtype kwarg not supported in read_fwf until 0.20
8596
for (col, _dtype) in zip(data.columns, DTYPES):
8697
data[col] = data[col].astype(_dtype)
@@ -98,8 +109,11 @@ def read_crn(filename):
98109
except TypeError:
99110
pass
100111

101-
# set nans
112+
# Now we can set nans. This could be done on a per-column basis to be
113+
# safer, since in principle a real -99 value could occur in a -9999
114+
# column. Very unlikely to see that in the real world.
102115
for val in [-99, -999, -9999]:
116+
# consider replacing with .replace([-99, -999, -9999], np.nan)
103117
data = data.where(data != val, np.nan)
104118

105119
data = data.rename(columns=VARIABLE_MAP)

pvlib/tests/iotools/test_crn.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,39 @@
88

99

1010
@pytest.fixture
11-
def testfile():
12-
return DATA_DIR / 'CRNS0101-05-2019-AZ_Tucson_11_W.txt'
13-
14-
15-
def test_read_crn(testfile):
16-
columns = [
11+
def columns():
12+
return [
1713
'WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN',
1814
'longitude', 'latitude', 'temp_air', 'PRECIPITATION', 'ghi',
1915
'ghi_flag',
2016
'SURFACE_TEMPERATURE', 'ST_TYPE', 'ST_FLAG', 'relative_humidity',
2117
'relative_humidity_flag', 'SOIL_MOISTURE_5', 'SOIL_TEMPERATURE_5',
2218
'WETNESS', 'WET_FLAG', 'wind_speed', 'wind_speed_flag']
19+
20+
21+
@pytest.fixture
22+
def dtypes():
23+
return [
24+
dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
25+
dtype('int64'), dtype('O'), dtype('float64'), dtype('float64'),
26+
dtype('float64'), dtype('float64'), dtype('float64'),
27+
dtype('int64'), dtype('float64'), dtype('O'), dtype('int64'),
28+
dtype('float64'), dtype('int64'), dtype('float64'),
29+
dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
30+
dtype('int64')]
31+
32+
33+
@pytest.fixture
34+
def testfile():
35+
return DATA_DIR / 'CRNS0101-05-2019-AZ_Tucson_11_W.txt'
36+
37+
38+
@pytest.fixture
39+
def testfile_problems():
40+
return DATA_DIR / 'CRN_with_problems.txt'
41+
42+
43+
def test_read_crn(testfile, columns, dtypes):
2344
index = pd.DatetimeIndex(['2019-01-01 16:10:00',
2445
'2019-01-01 16:15:00',
2546
'2019-01-01 16:20:00',
@@ -34,16 +55,26 @@ def test_read_crn(testfile):
3455
0.0, 340.0, 0, 4.3, 'C', 0, 83.0, 0, nan, nan, 1183, 0, 0.53, 0],
3556
[53131, 20190101, 1625, 20190101, 925, 3, -111.17, 32.24, 4.0,
3657
0.0, 393.0, 0, 4.8, 'C', 0, 81.0, 0, nan, nan, 1223, 0, 0.64, 0]])
37-
dtypes = [
38-
dtype('int64'), dtype('int64'), dtype('int64'), dtype('int64'),
39-
dtype('int64'), dtype('int64'), dtype('float64'), dtype('float64'),
40-
dtype('float64'), dtype('float64'), dtype('float64'),
41-
dtype('int64'), dtype('float64'), dtype('O'), dtype('int64'),
42-
dtype('float64'), dtype('int64'), dtype('float64'),
43-
dtype('float64'), dtype('int64'), dtype('int64'), dtype('float64'),
44-
dtype('int64')]
4558
expected = pd.DataFrame(values, columns=columns, index=index)
4659
for (col, _dtype) in zip(expected.columns, dtypes):
4760
expected[col] = expected[col].astype(_dtype)
4861
out = crn.read_crn(testfile)
4962
assert_frame_equal(out, expected)
63+
64+
65+
def test_read_crn_problems(testfile_problems, columns, dtypes):
66+
# GH1025
67+
index = pd.DatetimeIndex(['2020-07-06 12:00:00',
68+
'2020-07-06 13:10:00'],
69+
freq=None).tz_localize('UTC')
70+
values = np.array([
71+
[92821, 20200706, 1200, 20200706, 700, '3', -80.69, 28.62, 24.9,
72+
0.0, 190.0, 0, 25.5, 'C', 0, 93.0, 0, nan, nan, 990, 0, 1.57, 0],
73+
[92821, 20200706, 1310, 20200706, 810, '2.623', -80.69, 28.62,
74+
26.9, 0.0, 430.0, 0, 30.2, 'C', 0, 87.0, 0, nan, nan, 989, 0,
75+
1.64, 0]])
76+
expected = pd.DataFrame(values, columns=columns, index=index)
77+
for (col, _dtype) in zip(expected.columns, dtypes):
78+
expected[col] = expected[col].astype(_dtype)
79+
out = crn.read_crn(testfile_problems)
80+
assert_frame_equal(out, expected)

0 commit comments

Comments
 (0)