Skip to content

Commit

Permalink
Merge pull request #34 from mdavis-xyz/pandas_warning
Browse files Browse the repository at this point in the history
Pass datetime format explicitly when parsing
  • Loading branch information
nick-gorman authored May 16, 2024
2 parents e2f21e6 + 43b8631 commit 9ed5346
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 71 deletions.
2 changes: 1 addition & 1 deletion nemosis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import sys
from . import data_fetch_methods
from .value_parser import _parse_datetime, _parse_column, _infer_column_data_types
from .data_fetch_methods import *

name = "osdan"
Expand Down
39 changes: 2 additions & 37 deletions nemosis/data_fetch_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from . import processing_info_maps as _processing_info_maps
from . import defaults as _defaults
from . import custom_tables as _custom_tables
from . import _infer_column_data_types
from .custom_errors import UserInputError, NoDataToReturn, DataMismatchError

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -257,7 +258,6 @@ def static_table(
update_static_file (bool): If True download latest version of file
even if a version already exists.
Default is False.
**kwargs: additional arguments passed to the pd.to_{fformat}() function
Returns:
data (pd.Dataframe)
Expand Down Expand Up @@ -772,42 +772,7 @@ def _download_data(
)
return


def _infer_column_data_types(data):
"""
Infer datatype of DataFrame assuming inference need only be carried out
for any columns with dtype "object". Adapted from StackOverflow.
If the column is an object type, attempt conversions to (in order of):
1. datetime
2. numeric
Returns: Data with inferred types.
"""

def _get_series_type(series):
if series.dtype == "object":
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
col_new = _pd.to_datetime(series)
return col_new
except Exception as e:
try:
col_new = _pd.to_numeric(series)
return col_new
except Exception as e:
return series
else:
return series

for col in data:
series = data[col]
typed = _get_series_type(series)
data[col] = typed
return data



# GUI wrappers and mappers below


Expand Down
6 changes: 6 additions & 0 deletions nemosis/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,12 @@

months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]

# Datetime formats observed in AEMO data, ordered from most to least common.
# Consumers try these in order when parsing raw string columns.
date_formats = [
    "%Y/%m/%d %H:%M:%S",     # the standard format used by most AEMO tables
    "%Y/%m/%d %H:%M:%S.%f",  # millisecond precision, used in some bidding columns
    "%Y-%m-%d %H:%M:%S",     # dash-separated, used in some 4-second FCAS data
]

nem_data_model_start_time = "2009/07/01 00:00:00"


Expand Down
38 changes: 13 additions & 25 deletions nemosis/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,30 @@
from datetime import datetime, timedelta
import numpy as np

from .value_parser import _parse_datetime

logger = logging.getLogger(__name__)

def filter_on_start_and_end_date(data, start_time, end_time):
data["START_DATE"] = pd.to_datetime(data["START_DATE"], format="%Y/%m/%d %H:%M:%S")
data["START_DATE"] = _parse_datetime(data["START_DATE"])
data["END_DATE"] = np.where(
data["END_DATE"] == "2999/12/31 00:00:00",
"2100/12/31 00:00:00",
data["END_DATE"],
)
data["END_DATE"] = pd.to_datetime(data["END_DATE"], format="%Y/%m/%d %H:%M:%S")
data["END_DATE"] = _parse_datetime(data["END_DATE"])
data = data[(data["START_DATE"] < end_time) & (data["END_DATE"] > start_time)]
return data


def filter_on_effective_date(data, start_time, end_time):
data["EFFECTIVEDATE"] = pd.to_datetime(
data["EFFECTIVEDATE"], format="%Y/%m/%d %H:%M:%S"
)
data["EFFECTIVEDATE"] = _parse_datetime(data["EFFECTIVEDATE"])
data = data[data["EFFECTIVEDATE"] < end_time]
return data


def filter_on_settlementdate(data, start_time, end_time):
data["SETTLEMENTDATE"] = pd.to_datetime(
data["SETTLEMENTDATE"], format="%Y/%m/%d %H:%M:%S"
)
data["SETTLEMENTDATE"] = _parse_datetime(data["SETTLEMENTDATE"])
data = data[
(data["SETTLEMENTDATE"] > start_time) & (data["SETTLEMENTDATE"] <= end_time)
]
Expand All @@ -37,29 +35,23 @@ def filter_on_settlementdate(data, start_time, end_time):

def filter_on_timestamp(data, start_time, end_time):
try:
data["TIMESTAMP"] = pd.to_datetime(
data["TIMESTAMP"], format="%Y/%m/%d %H:%M:%S"
)
except Exception as e:
data["TIMESTAMP"] = _parse_datetime(data["TIMESTAMP"])
except ValueError as e:
logger.error(e)
# if date format is wrong, str may be too short
med_str_len = np.median(data["TIMESTAMP"].str.len())
not_data = data.loc[data["TIMESTAMP"].str.len() < med_str_len, :]
data = data.loc[data["TIMESTAMP"].str.len() >= med_str_len, :]
data["TIMESTAMP"] = pd.to_datetime(
data["TIMESTAMP"], format="%Y/%m/%d %H:%M:%S"
)
data["TIMESTAMP"] = _parse_datetime(data["TIMESTAMP"])
logger.warning("Rows with incorrect data formats omitted")
logger.warning(not_data)
logger.warning(not_data.head())
finally:
data = data[(data["TIMESTAMP"] > start_time) & (data["TIMESTAMP"] <= end_time)]
return data


def filter_on_interval_datetime(data, start_time, end_time):
data["INTERVAL_DATETIME"] = pd.to_datetime(
data["INTERVAL_DATETIME"], format="%Y/%m/%d %H:%M:%S"
)
data["INTERVAL_DATETIME"] = _parse_datetime(data["INTERVAL_DATETIME"])
data = data[
(data["INTERVAL_DATETIME"] > start_time)
& (data["INTERVAL_DATETIME"] <= end_time)
Expand All @@ -79,9 +71,7 @@ def filter_on_date_and_peroid(data, start_time, end_time):

# Not tested, just for nemlite integration.
def filter_on_date_and_interval(data, start_time, end_time):
data["SETTLEMENTDATE"] = pd.to_datetime(
data["SETTLEMENTDATE"], format="%Y/%m/%d %H:%M:%S"
)
data["SETTLEMENTDATE"] = _parse_datetime(data["SETTLEMENTDATE"])
data = data[
(data["SETTLEMENTDATE"] > start_time) & (data["SETTLEMENTDATE"] <= end_time)
]
Expand All @@ -90,9 +80,7 @@ def filter_on_date_and_interval(data, start_time, end_time):

# Not tested, just for nemlite integration.
def filter_on_last_changed(data, start_time, end_time):
data["LASTCHANGED"] = pd.to_datetime(
data["LASTCHANGED"], format="%Y/%m/%d %H:%M:%S"
)
data["LASTCHANGED"] = _parse_datetime(data["LASTCHANGED"])
data = data[data["LASTCHANGED"] < end_time]
return data

Expand Down
4 changes: 2 additions & 2 deletions nemosis/query_wrappers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from datetime import datetime, timedelta
from . import defaults
from . import defaults, _parse_datetime


def dispatch_date_setup(start_time, end_time):
Expand Down Expand Up @@ -70,5 +70,5 @@ def drop_duplicates_by_primary_key(data, start_time, table_name):

def convert_genconid_effectivedate_to_datetime_format(data, start_time, table_name):
if "GENCONID_EFFECTIVEDATE" in data.columns:
data["GENCONID_EFFECTIVEDATE"] = pd.to_datetime(data["GENCONID_EFFECTIVEDATE"])
data["GENCONID_EFFECTIVEDATE"] = _parse_datetime(data["GENCONID_EFFECTIVEDATE"])
return data
65 changes: 65 additions & 0 deletions nemosis/value_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import pandas as pd

from . import defaults as _defaults


def _parse_datetime(series):
    """
    Attempt to parse a column into a datetime.

    Each known format in _defaults.date_formats is tried in order (the
    list is ordered from most to least common: the standard AEMO format,
    the millisecond format used in some bidding columns, then the
    dash-separated format used in some 4-second FCAS data). Iterating the
    list — rather than hard-coding three nested attempts — means new
    formats added to defaults.date_formats are picked up automatically.

    Args:
        series: a pandas Series (column) of datetime strings
    Returns:
        series parsed to datetime64 dtype
    Raises:
        ValueError: if the data matches none of the known formats; the
            error from the final attempt is propagated so callers (e.g.
            _parse_column) can fall back to other conversions.
    """
    last_error = None
    for date_format in _defaults.date_formats:
        try:
            return pd.to_datetime(series, format=date_format)
        except ValueError as e:
            last_error = e
    # date_formats is a non-empty list in defaults, so last_error is set.
    raise last_error


def _parse_column(series):
    """
    Attempt to parse a column into a datetime or numeric.

    Conversions are tried in order of: datetime first, then numeric.
    If neither succeeds the original column is returned unchanged.

    Args:
        series: a pandas Series (column)
    Returns:
        The converted Series, or the original Series if no conversion
        applied.
    """
    try:
        return _parse_datetime(series)
    except ValueError:
        # Not a datetime column; fall through to the numeric attempt.
        pass
    try:
        return pd.to_numeric(series)
    except ValueError:
        # Not numeric either; hand back the column untouched.
        return series


def _infer_column_data_types(data):
    """
    Infer datatype of DataFrame assuming inference need only be carried
    out for any columns with dtype "object". Adapted from StackOverflow.

    Each column is passed through _parse_column, which attempts
    conversion to (in order of):
      1. datetime
      2. numeric
    and leaves the column unchanged if neither applies.

    Returns: Data with inferred types.
    """
    for column_name in data.columns:
        data[column_name] = _parse_column(data[column_name])
    return data
30 changes: 24 additions & 6 deletions tests/test_errors_and_warnings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import unittest
from nemosis import dynamic_data_compiler, cache_compiler, static_table, defaults
import os
import sys
from unittest.mock import patch
from io import StringIO


class TestDynamicDataCompilerRaisesExpectedErrors(unittest.TestCase):
Expand Down Expand Up @@ -96,14 +99,14 @@ def test_raise_error_if_select_columns_not_in_data(self):
defaults.raw_data_cache,
select_columns=["NOTACOLUMN"],
)
self.assertTrue(
self.assertIn(
(
f"None of columns ['NOTACOLUMN'] are in D:/nemosis_test_cache\\PUBLIC_DVD_DISPATCHPRICE_201812010000.feather. "
"This may be caused by user input if the 'select_columns' "
"argument is being used, or by changed AEMO data formats. "
"This error can be avoided by using the argument select_columns='all'."
)
in str(context.exception)
),
str(context.exception)
)

def test_using_select_columns_all_does_not_raise_error(self):
Expand Down Expand Up @@ -177,6 +180,20 @@ def test_using_select_columns_all_does_not_raise_error(self):
]
self.assertSequenceEqual(list(price_data.columns), expected_columns)

class TestWarnings(unittest.TestCase):
    def test_no_parse_warning(self):
        """Compiling data must not trigger pandas' "Could not infer format" warning.

        Datetime formats are passed explicitly to pd.to_datetime, so no
        format-inference UserWarning should be written to stderr. We patch
        sys.stderr and assert the warning text never appears.
        """
        with patch('sys.stderr', new=StringIO()) as fake_stderr:
            dynamic_data_compiler(
                start_time='2017/01/01 00:00:00',
                end_time='2017/01/01 00:05:00',
                table_name='DISPATCHPRICE',
                raw_data_location=defaults.raw_data_cache
            )
            # Read the captured stream while the patch is still active.
            stderr = fake_stderr.getvalue().strip()

        self.assertNotIn("UserWarning: Could not infer format", stderr)

class TestCacheCompilerRaisesExpectedErrors(unittest.TestCase):
def test_raise_error_for_incorrect_table_name(self):
Expand Down Expand Up @@ -223,6 +240,7 @@ def test_raise_error_if_select_columns_used_without_rebuild_true(self):
)



class TestStaticTableRaisesExpectedErrors(unittest.TestCase):
def test_raise_error_for_incorrect_table_name(self):
with self.assertRaises(Exception) as context:
Expand Down Expand Up @@ -294,14 +312,14 @@ def test_raise_error_if_select_columns_not_in_data(self):
defaults.raw_data_cache,
select_columns=["NOTACOLUMN"],
)
self.assertTrue(
self.assertIn(
(
f"None of columns ['NOTACOLUMN'] are in D:/nemosis_test_cache\\Ancillary Services Market Causer Pays Variables File.csv. "
"This may be caused by user input if the 'select_columns' "
"argument is being used, or by changed AEMO data formats. "
"This error can be avoided by using the argument select_columns='all'."
)
in str(context.exception)
),
str(context.exception)
)

def test_using_select_columns_all_does_not_raise_error(self):
Expand Down

0 comments on commit 9ed5346

Please sign in to comment.