Skip to content

Commit 7a5f655

Browse files
authored
fix: handle null values in data (#636)
* fix: handle null values in Flux data * test: add tests for null value handling and extension dtypes * test: fix failures with empty warnings cases * test: comment out dtype some extra assertion until solved * test: skip extension dtypes test on Python 3.7 * fix: single place of dtypes conversion * fix: bump pandas dependency version * docs: update CHANGELOG * chore(build): trigger CI/CD pipeline * fix: add use_extension_dtypes also to async query API methods
1 parent 27777d1 commit 7a5f655

9 files changed

+272
-29
lines changed

Diff for: CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
## 1.41.0 [unreleased]
22

3+
### Bug Fixes
4+
1. [#636](https://github.com/influxdata/influxdb-client-python/pull/636): Handle missing data in data frames
5+
36
## 1.40.0 [2024-01-30]
47

58
### Features

Diff for: influxdb_client/client/_base.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -277,23 +277,27 @@ async def _to_flux_record_stream_async(self, response, query_options=None, respo
277277
return (await _parser.__aenter__()).generator_async()
278278

279279
def _to_data_frame_stream(self, data_frame_index, response, query_options=None,
280-
response_metadata_mode: FluxResponseMetadataMode = FluxResponseMetadataMode.full):
280+
response_metadata_mode: FluxResponseMetadataMode = FluxResponseMetadataMode.full,
281+
use_extension_dtypes=False):
281282
"""
282283
Parse HTTP response to DataFrame stream.
283284
284285
:param response: HTTP response from an HTTP client. Expected type: `urllib3.response.HTTPResponse`.
285286
"""
286-
_parser = self._to_data_frame_stream_parser(data_frame_index, query_options, response, response_metadata_mode)
287+
_parser = self._to_data_frame_stream_parser(data_frame_index, query_options, response, response_metadata_mode,
288+
use_extension_dtypes)
287289
return _parser.generator()
288290

289291
async def _to_data_frame_stream_async(self, data_frame_index, response, query_options=None, response_metadata_mode:
290-
FluxResponseMetadataMode = FluxResponseMetadataMode.full):
292+
FluxResponseMetadataMode = FluxResponseMetadataMode.full,
293+
use_extension_dtypes=False):
291294
"""
292295
Parse HTTP response to DataFrame stream.
293296
294297
:param response: HTTP response from an HTTP client. Expected type: `aiohttp.client_reqrep.ClientResponse`.
295298
"""
296-
_parser = self._to_data_frame_stream_parser(data_frame_index, query_options, response, response_metadata_mode)
299+
_parser = self._to_data_frame_stream_parser(data_frame_index, query_options, response, response_metadata_mode,
300+
use_extension_dtypes)
297301
return (await _parser.__aenter__()).generator_async()
298302

299303
def _to_tables_parser(self, response, query_options, response_metadata_mode):
@@ -304,10 +308,12 @@ def _to_flux_record_stream_parser(self, query_options, response, response_metada
304308
return FluxCsvParser(response=response, serialization_mode=FluxSerializationMode.stream,
305309
query_options=query_options, response_metadata_mode=response_metadata_mode)
306310

307-
def _to_data_frame_stream_parser(self, data_frame_index, query_options, response, response_metadata_mode):
311+
def _to_data_frame_stream_parser(self, data_frame_index, query_options, response, response_metadata_mode,
312+
use_extension_dtypes):
308313
return FluxCsvParser(response=response, serialization_mode=FluxSerializationMode.dataFrame,
309314
data_frame_index=data_frame_index, query_options=query_options,
310-
response_metadata_mode=response_metadata_mode)
315+
response_metadata_mode=response_metadata_mode,
316+
use_extension_dtypes=use_extension_dtypes)
311317

312318
def _to_data_frames(self, _generator):
313319
"""Parse stream of DataFrames into expected type."""

Diff for: influxdb_client/client/flux_csv_parser.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ class FluxCsvParser(object):
6464

6565
def __init__(self, response, serialization_mode: FluxSerializationMode,
6666
data_frame_index: List[str] = None, query_options=None,
67-
response_metadata_mode: FluxResponseMetadataMode = FluxResponseMetadataMode.full) -> None:
67+
response_metadata_mode: FluxResponseMetadataMode = FluxResponseMetadataMode.full,
68+
use_extension_dtypes=False) -> None:
6869
"""
6970
Initialize defaults.
7071
@@ -75,6 +76,7 @@ def __init__(self, response, serialization_mode: FluxSerializationMode,
7576
self.tables = TableList()
7677
self._serialization_mode = serialization_mode
7778
self._response_metadata_mode = response_metadata_mode
79+
self._use_extension_dtypes = use_extension_dtypes
7880
self._data_frame_index = data_frame_index
7981
self._data_frame_values = []
8082
self._profilers = query_options.profilers if query_options is not None else None
@@ -211,7 +213,7 @@ def _parse_flux_response_row(self, metadata, csv):
211213
pass
212214
else:
213215

214-
# to int converions todo
216+
# to int conversions todo
215217
current_id = int(csv[2])
216218
if metadata.table_id == -1:
217219
metadata.table_id = current_id
@@ -253,7 +255,11 @@ def _prepare_data_frame(self):
253255
_temp_df = _temp_df.set_index(self._data_frame_index)
254256

255257
# Append data
256-
return pd.concat([self._data_frame.astype(_temp_df.dtypes), _temp_df])
258+
df = pd.concat([self._data_frame.astype(_temp_df.dtypes), _temp_df])
259+
260+
if self._use_extension_dtypes:
261+
return df.convert_dtypes()
262+
return df
257263

258264
def parse_record(self, table_index, table, csv):
259265
"""Parse one record."""
@@ -273,8 +279,10 @@ def _to_value(self, str_val, column):
273279
default_value = column.default_value
274280
if default_value == '' or default_value is None:
275281
if self._serialization_mode is FluxSerializationMode.dataFrame:
276-
from ..extras import np
277-
return self._to_value(np.nan, column)
282+
if self._use_extension_dtypes:
283+
from ..extras import pd
284+
return pd.NA
285+
return None
278286
return None
279287
return self._to_value(default_value, column)
280288

Diff for: influxdb_client/client/query_api.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,8 @@ def query_stream(self, query: str, org=None, params: dict = None) -> Generator['
222222
async_req=False, _preload_content=False, _return_http_data_only=False)
223223
return self._to_flux_record_stream(response, query_options=self._get_query_options())
224224

225-
def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None):
225+
def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None,
226+
use_extension_dtypes: bool = False):
226227
"""
227228
Execute synchronous Flux query and return Pandas DataFrame.
228229
@@ -234,6 +235,11 @@ def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = N
234235
If not specified the default value from ``InfluxDBClient.org`` is used.
235236
:param data_frame_index: the list of columns that are used as DataFrame index
236237
:param params: bind parameters
238+
:param use_extension_dtypes: set to ``True`` to use pandas extension data types.
239+
Useful for queries with ``pivot`` function.
240+
When data has missing values, column data type may change (to ``object`` or ``float64``).
241+
Nullable extension types (``Int64``, ``Float64``, ``boolean``) support the ``pandas.NA`` value.
242+
For more info, see https://pandas.pydata.org/docs/user_guide/missing_data.html.
237243
:return: :class:`~DataFrame` or :class:`~List[DataFrame]`
238244
239245
.. warning:: For the optimal processing of the query results use the ``pivot()`` function, which aligns results as a table.
@@ -250,10 +256,12 @@ def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = N
250256
- https://docs.influxdata.com/flux/latest/stdlib/universe/pivot/
251257
- https://docs.influxdata.com/flux/latest/stdlib/influxdata/influxdb/schema/fieldsascols/
252258
""" # noqa: E501
253-
_generator = self.query_data_frame_stream(query, org=org, data_frame_index=data_frame_index, params=params)
259+
_generator = self.query_data_frame_stream(query, org=org, data_frame_index=data_frame_index, params=params,
260+
use_extension_dtypes=use_extension_dtypes)
254261
return self._to_data_frames(_generator)
255262

256-
def query_data_frame_stream(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None):
263+
def query_data_frame_stream(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None,
264+
use_extension_dtypes: bool = False):
257265
"""
258266
Execute synchronous Flux query and return stream of Pandas DataFrame as a :class:`~Generator[DataFrame]`.
259267
@@ -265,6 +273,11 @@ def query_data_frame_stream(self, query: str, org=None, data_frame_index: List[s
265273
If not specified the default value from ``InfluxDBClient.org`` is used.
266274
:param data_frame_index: the list of columns that are used as DataFrame index
267275
:param params: bind parameters
276+
:param use_extension_dtypes: set to ``True`` to use pandas extension data types.
277+
Useful for queries with ``pivot`` function.
278+
When data has missing values, column data type may change (to ``object`` or ``float64``).
279+
Nullable extension types (``Int64``, ``Float64``, ``boolean``) support the ``pandas.NA`` value.
280+
For more info, see https://pandas.pydata.org/docs/user_guide/missing_data.html.
268281
:return: :class:`~Generator[DataFrame]`
269282
270283
.. warning:: For the optimal processing of the query results use the ``pivot()`` function, which aligns results as a table.
@@ -289,7 +302,8 @@ def query_data_frame_stream(self, query: str, org=None, data_frame_index: List[s
289302

290303
return self._to_data_frame_stream(data_frame_index=data_frame_index,
291304
response=response,
292-
query_options=self._get_query_options())
305+
query_options=self._get_query_options(),
306+
use_extension_dtypes=use_extension_dtypes)
293307

294308
def __del__(self):
295309
"""Close QueryAPI."""

Diff for: influxdb_client/client/query_api_async.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ async def query_stream(self, query: str, org=None, params: dict = None) -> Async
120120

121121
return await self._to_flux_record_stream_async(response, query_options=self._get_query_options())
122122

123-
async def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None):
123+
async def query_data_frame(self, query: str, org=None, data_frame_index: List[str] = None, params: dict = None,
124+
use_extension_dtypes: bool = False):
124125
"""
125126
Execute asynchronous Flux query and return :class:`~pandas.core.frame.DataFrame`.
126127
@@ -132,6 +133,11 @@ async def query_data_frame(self, query: str, org=None, data_frame_index: List[st
132133
If not specified the default value from ``InfluxDBClientAsync.org`` is used.
133134
:param data_frame_index: the list of columns that are used as DataFrame index
134135
:param params: bind parameters
136+
:param use_extension_dtypes: set to ``True`` to use pandas extension data types.
137+
Useful for queries with ``pivot`` function.
138+
When data has missing values, column data type may change (to ``object`` or ``float64``).
139+
Nullable extension types (``Int64``, ``Float64``, ``boolean``) support the ``pandas.NA`` value.
140+
For more info, see https://pandas.pydata.org/docs/user_guide/missing_data.html.
135141
:return: :class:`~DataFrame` or :class:`~List[DataFrame]`
136142
137143
.. warning:: For the optimal processing of the query results use the ``pivot()`` function, which aligns results as a table.
@@ -149,7 +155,7 @@ async def query_data_frame(self, query: str, org=None, data_frame_index: List[st
149155
- https://docs.influxdata.com/flux/latest/stdlib/influxdata/influxdb/schema/fieldsascols/
150156
""" # noqa: E501
151157
_generator = await self.query_data_frame_stream(query, org=org, data_frame_index=data_frame_index,
152-
params=params)
158+
params=params, use_extension_dtypes=use_extension_dtypes)
153159

154160
dataframes = []
155161
async for dataframe in _generator:
@@ -158,7 +164,7 @@ async def query_data_frame(self, query: str, org=None, data_frame_index: List[st
158164
return self._to_data_frames(dataframes)
159165

160166
async def query_data_frame_stream(self, query: str, org=None, data_frame_index: List[str] = None,
161-
params: dict = None):
167+
params: dict = None, use_extension_dtypes: bool = False):
162168
"""
163169
Execute asynchronous Flux query and return stream of :class:`~pandas.core.frame.DataFrame` as an AsyncGenerator[:class:`~pandas.core.frame.DataFrame`].
164170
@@ -170,6 +176,11 @@ async def query_data_frame_stream(self, query: str, org=None, data_frame_index:
170176
If not specified the default value from ``InfluxDBClientAsync.org`` is used.
171177
:param data_frame_index: the list of columns that are used as DataFrame index
172178
:param params: bind parameters
179+
:param use_extension_dtypes: set to ``True`` to use pandas extension data types.
180+
Useful for queries with ``pivot`` function.
181+
When data has missing values, column data type may change (to ``object`` or ``float64``).
182+
Nullable extension types (``Int64``, ``Float64``, ``boolean``) support the ``pandas.NA`` value.
183+
For more info, see https://pandas.pydata.org/docs/user_guide/missing_data.html.
173184
:return: :class:`AsyncGenerator[:class:`DataFrame`]`
174185
175186
.. warning:: For the optimal processing of the query results use the ``pivot()`` function, which aligns results as a table.
@@ -192,7 +203,8 @@ async def query_data_frame_stream(self, query: str, org=None, data_frame_index:
192203
dataframe_query=True))
193204

194205
return await self._to_data_frame_stream_async(data_frame_index=data_frame_index, response=response,
195-
query_options=self._get_query_options())
206+
query_options=self._get_query_options(),
207+
use_extension_dtypes=use_extension_dtypes)
196208

197209
async def query_raw(self, query: str, org=None, dialect=_BaseQueryApi.default_dialect, params: dict = None):
198210
"""

Diff for: setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
]
3232

3333
extra_requires = [
34-
'pandas>=0.25.3',
34+
'pandas>=1.0.0',
3535
'numpy'
3636
]
3737

0 commit comments

Comments
 (0)