Skip to content

Commit e2cb42e

Browse files
authored
fix: serialize Pandas NaN values into LineProtocol (#648)
1 parent a645ea9 commit e2cb42e

File tree

3 files changed

+42
-20
lines changed

3 files changed

+42
-20
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
## 1.42.0 [unreleased]
22

3+
### Bug Fixes
4+
1. [#648](https://github.com/influxdata/influxdb-client-python/pull/648): Fix `DataFrame` serialization of missing values (`NaN`, `None`, `pd.NA`), including columns that use pandas nullable dtypes
5+
36
## 1.41.0 [2024-03-01]
47

58
### Features

influxdb_client/client/write/dataframe_serializer.py

+13-20
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,6 @@ def _itertuples(data_frame):
1919
return zip(data_frame.index, *cols)
2020

2121

22-
def _not_nan(x):
23-
return x == x
24-
25-
26-
def _any_not_nan(p, indexes):
27-
return any(map(lambda x: _not_nan(p[x]), indexes))
28-
29-
3022
class DataframeSerializer:
3123
"""Serialize DataFrame into LineProtocols."""
3224

@@ -77,7 +69,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
7769
# When NaNs are present, the expression looks like this (split
7870
# across two lines to satisfy the code-style checker)
7971
#
80-
# lambda p: f"""{measurement_name} {"" if math.isnan(p[1])
72+
# lambda p: f"""{measurement_name} {"" if pd.isna(p[1])
8173
# else f"{keys[0]}={p[1]}"},{keys[1]}={p[2]}i {p[0].value}"""
8274
#
8375
# When there's a NaN value in column a, we'll end up with a comma at the start of the
@@ -175,7 +167,7 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
175167
# This column is a tag column.
176168
if null_columns.iloc[index]:
177169
key_value = f"""{{
178-
'' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
170+
'' if {val_format} == '' or pd.isna({val_format}) else
179171
f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
180172
}}"""
181173
else:
@@ -192,19 +184,16 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
192184
# field column has no nulls, we don't run the comma-removal
193185
# regexp substitution step.
194186
sep = '' if len(field_indexes) == 0 else ','
195-
if issubclass(value.type, np.integer):
196-
field_value = f"{sep}{key_format}={{{val_format}}}i"
197-
elif issubclass(value.type, np.bool_):
198-
field_value = f'{sep}{key_format}={{{val_format}}}'
199-
elif issubclass(value.type, np.floating):
187+
if issubclass(value.type, np.integer) or issubclass(value.type, np.floating) or issubclass(value.type, np.bool_): # noqa: E501
188+
suffix = 'i' if issubclass(value.type, np.integer) else ''
200189
if null_columns.iloc[index]:
201-
field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
190+
field_value = f"""{{"" if pd.isna({val_format}) else f"{sep}{key_format}={{{val_format}}}{suffix}"}}""" # noqa: E501
202191
else:
203-
field_value = f'{sep}{key_format}={{{val_format}}}'
192+
field_value = f"{sep}{key_format}={{{val_format}}}{suffix}"
204193
else:
205194
if null_columns.iloc[index]:
206195
field_value = f"""{{
207-
'' if type({val_format}) == float and math.isnan({val_format}) else
196+
'' if pd.isna({val_format}) else
208197
f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
209198
}}"""
210199
else:
@@ -229,17 +218,21 @@ def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION
229218
'_ESCAPE_KEY': _ESCAPE_KEY,
230219
'_ESCAPE_STRING': _ESCAPE_STRING,
231220
'keys': keys,
232-
'math': math,
221+
'pd': pd,
233222
})
234223

235224
for k, v in dict(data_frame.dtypes).items():
236225
if k in data_frame_tag_columns:
237226
data_frame = data_frame.replace({k: ''}, np.nan)
238227

228+
def _any_not_nan(p, indexes):
229+
return any(map(lambda x: not pd.isna(p[x]), indexes))
230+
239231
self.data_frame = data_frame
240232
self.f = f
241233
self.field_indexes = field_indexes
242234
self.first_field_maybe_null = null_columns.iloc[field_indexes[0] - 1]
235+
self._any_not_nan = _any_not_nan
243236

244237
#
245238
# prepare chunks
@@ -266,7 +259,7 @@ def serialize(self, chunk_idx: int = None):
266259
# When the first field is null (None/NaN), we'll have
267260
# a spurious leading comma which needs to be removed.
268261
lp = (re.sub('^(( |[^ ])* ),([a-zA-Z0-9])(.*)', '\\1\\3\\4', self.f(p))
269-
for p in filter(lambda x: _any_not_nan(x, self.field_indexes), _itertuples(chunk)))
262+
for p in filter(lambda x: self._any_not_nan(x, self.field_indexes), _itertuples(chunk)))
270263
return list(lp)
271264
else:
272265
return list(map(self.f, _itertuples(chunk)))

tests/test_WriteApiDataFrame.py

+26
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,32 @@ def test_write_object_field_nan(self):
159159
self.assertEqual("measurement val=2i 1586046600000000000",
160160
points[1])
161161

162+
def test_write_missing_values(self):
163+
from influxdb_client.extras import pd
164+
165+
data_frame = pd.DataFrame({
166+
"a_bool": [True, None, False],
167+
"b_int": [None, 1, 2],
168+
"c_float": [1.0, 2.0, None],
169+
"d_str": ["a", "b", None],
170+
})
171+
172+
data_frame['a_bool'] = data_frame['a_bool'].astype(pd.BooleanDtype())
173+
data_frame['b_int'] = data_frame['b_int'].astype(pd.Int64Dtype())
174+
data_frame['c_float'] = data_frame['c_float'].astype(pd.Float64Dtype())
175+
data_frame['d_str'] = data_frame['d_str'].astype(pd.StringDtype())
176+
177+
print(data_frame)
178+
points = data_frame_to_list_of_points(
179+
data_frame=data_frame,
180+
point_settings=PointSettings(),
181+
data_frame_measurement_name='measurement')
182+
183+
self.assertEqual(3, len(points))
184+
self.assertEqual("measurement a_bool=True,c_float=1.0,d_str=\"a\" 0", points[0])
185+
self.assertEqual("measurement b_int=1i,c_float=2.0,d_str=\"b\" 1", points[1])
186+
self.assertEqual("measurement a_bool=False,b_int=2i 2", points[2])
187+
162188
def test_write_field_bool(self):
163189
from influxdb_client.extras import pd
164190

0 commit comments

Comments
 (0)