diff --git a/CHANGELOG.md b/CHANGELOG.md index d8ea83be..ffe7ad6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ### Tobac Changelog +_**Unreleased Changes:**_ + +**Bug fixes** + +- Update internals to allow for pandas 3 compatibility [#568](https://github.com/tobac-project/tobac/pull/568) + _**Version 1.6.3:**_ **Enhancements for Users** diff --git a/environment-ci.yml b/environment-ci.yml index ee3f8646..c325cea6 100644 --- a/environment-ci.yml +++ b/environment-ci.yml @@ -7,7 +7,7 @@ dependencies: - scipy - scikit-image - scikit-learn - - pandas<3 + - pandas - matplotlib - iris - xarray diff --git a/environment-examples.yml b/environment-examples.yml index a49e4dbf..d64f7e17 100644 --- a/environment-examples.yml +++ b/environment-examples.yml @@ -6,7 +6,7 @@ dependencies: - scipy - scikit-image - scikit-learn - - pandas<3 + - pandas - matplotlib - iris - xarray<2024.10.0 diff --git a/environment.yml b/environment.yml index 2de0f196..1fa30644 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ dependencies: - scipy - scikit-image - scikit-learn - - pandas<3 + - pandas - matplotlib - iris - xarray diff --git a/pyproject.toml b/pyproject.toml index d424377a..916dcd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ dependencies = [ "scipy", "scikit-image", "scikit-learn", - "pandas<3", + "pandas", "matplotlib", "scitools-iris", "xarray", diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index d5729ff2..b7d8e99a 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1539,7 +1539,6 @@ def feature_detection_multithreshold( # we map the feature index to the original index if return_labels: - for i, time_i, label_field_i, features_i in field_and_features_over_time( label_fields, features ): @@ -1684,9 +1683,11 @@ def filter_min_distance( # Calculate feature locations in cartesian coordinates if is_3D: - feature_locations = features[ - [z_coordinate_name, y_coordinate_name, 
x_coordinate_name] - ].to_numpy() + feature_locations = ( + features[[z_coordinate_name, y_coordinate_name, x_coordinate_name]] + .to_numpy() + .copy() + ) feature_locations[:, 0] *= dz feature_locations[:, 1:] *= dxy else: diff --git a/tobac/segmentation/watershed_segmentation.py b/tobac/segmentation/watershed_segmentation.py index c1323652..7524ebe5 100644 --- a/tobac/segmentation/watershed_segmentation.py +++ b/tobac/segmentation/watershed_segmentation.py @@ -776,7 +776,7 @@ def segmentation_timestep( ) # Get features that are needed for the buddy box - buddy_features = deepcopy(features_in.iloc[feat_inds]) + buddy_features = features_in.iloc[feat_inds].copy() # create arrays to contain points of all buddies # and their transpositions/transformations @@ -824,15 +824,15 @@ def segmentation_timestep( ) # edit value in buddy_features dataframe - buddy_features.hdim_1.values[buddy_looper] = ( - pbc_utils.transfm_pbc_point( - float(buddy_feat.hdim_1), hdim1_min, hdim1_max - ) + buddy_features.iloc[ + buddy_looper, buddy_features.columns.get_loc("hdim_1") + ] = pbc_utils.transfm_pbc_point( + float(buddy_feat.hdim_1), hdim1_min, hdim1_max ) - buddy_features.hdim_2.values[buddy_looper] = ( - pbc_utils.transfm_pbc_point( - float(buddy_feat.hdim_2), hdim2_min, hdim2_max - ) + buddy_features.iloc[ + buddy_looper, buddy_features.columns.get_loc("hdim_2") + ] = pbc_utils.transfm_pbc_point( + float(buddy_feat.hdim_2), hdim2_min, hdim2_max ) buddy_looper = buddy_looper + 1 @@ -903,16 +903,16 @@ def segmentation_timestep( if "vdim" not in buddy_features: buddy_features["vdim"] = np.zeros(len(buddy_features), dtype=int) for buddy_looper in range(0, len(buddy_features)): - buddy_features.vdim.values[buddy_looper] = ( - buddy_features.vdim.values[buddy_looper] - bbox_zstart - ) - - buddy_features.hdim_1.values[buddy_looper] = ( - buddy_features.hdim_1.values[buddy_looper] - bbox_ystart - ) - buddy_features.hdim_2.values[buddy_looper] = ( - 
buddy_features.hdim_2.values[buddy_looper] - bbox_xstart - ) + buddy_features.iloc[ + buddy_looper, buddy_features.columns.get_loc("vdim") + ] = (buddy_features.vdim.values[buddy_looper] - bbox_zstart) + + buddy_features.iloc[ + buddy_looper, buddy_features.columns.get_loc("hdim_1") + ] = (buddy_features.hdim_1.values[buddy_looper] - bbox_ystart) + buddy_features.iloc[ + buddy_looper, buddy_features.columns.get_loc("hdim_2") + ] = (buddy_features.hdim_2.values[buddy_looper] - bbox_xstart) # Create dask array from input data: buddy_data = buddy_rgn diff --git a/tobac/tests/test_datetime.py b/tobac/tests/test_datetime.py index 82a9f7b7..a5c002e3 100644 --- a/tobac/tests/test_datetime.py +++ b/tobac/tests/test_datetime.py @@ -143,6 +143,7 @@ def test_to_datestr(): assert ( datetime_utils.to_datestr(date) == "2000-01-01T00:00:00.000000000" or datetime_utils.to_datestr(date) == "2000-01-01T00:00:00" + or datetime_utils.to_datestr(date) == "2000-01-01T00:00:00.000000" ) @@ -163,9 +164,11 @@ def test_to_datestr_array(): cftime.DatetimeNoLeap(2000, 1, 1), ] for date in test_dates: - assert datetime_utils.to_datestr([date]) == [ - "2000-01-01T00:00:00.000000000" - ] or datetime_utils.to_datestr([date]) == ["2000-01-01T00:00:00"] + assert ( + datetime_utils.to_datestr([date]) == ["2000-01-01T00:00:00.000000000"] + or datetime_utils.to_datestr([date]) == ["2000-01-01T00:00:00"] + or (datetime_utils.to_datestr([date]) == ["2000-01-01T00:00:00.000000"]) + ) def test_match_datetime_format(): @@ -214,3 +217,18 @@ def test_match_datetime_format_error(): """ with pytest.raises(ValueError, match="Target is not a valid datetime*"): datetime_utils.match_datetime_format(datetime(2000, 1, 1), 1.5) + + +@pytest.mark.parametrize( + ["date_in", "precision"], + [ + ("2000-02-04T00:00:00", "s"), + ("2000-02-04T00:00:00.000", "ms"), + ("2000-02-04T00:00:00.000000", "us"), + ("2000-02-04T00:00:00.000000000", "ns"), + ("00:00:00", "s"), + ], +) +def test_detect_str_precision(date_in: str, 
precision: str): + """test that detect_str_precision returns the right precision""" + assert datetime_utils.detect_str_precision(date_in) == precision diff --git a/tobac/tests/tracking_tests/test_tracking.py b/tobac/tests/tracking_tests/test_tracking.py index 70be714c..951d5f56 100644 --- a/tobac/tests/tracking_tests/test_tracking.py +++ b/tobac/tests/tracking_tests/test_tracking.py @@ -549,7 +549,10 @@ def test_untracked_nat(): assert np.all(pd.isnull(output["time_cell"])) # the exact data type depends on architecture, so # instead just check by name - assert output["time_cell"].dtype.name == "timedelta64[ns]" + assert ( + output["time_cell"].dtype.name == "timedelta64[ns]" + or output["time_cell"].dtype.name == "timedelta64[us]" + ) @pytest.mark.parametrize( diff --git a/tobac/utils/datetime.py b/tobac/utils/datetime.py index 611603cb..1e83b9fb 100644 --- a/tobac/utils/datetime.py +++ b/tobac/utils/datetime.py @@ -1,11 +1,12 @@ """Functions for converting between and working with different datetime formats""" -from typing import Union +from typing import Union, Optional, Literal import datetime import numpy as np import pandas as pd import xarray as xr import cftime +import re def to_cftime( @@ -51,6 +52,7 @@ def to_cftime( def to_timestamp( dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime], + precision: Optional[Literal["ns", "us", "s", "ms"]] = None, ) -> pd.Timestamp: """Converts a provided datetime-like object to a pandas timestamp @@ -58,6 +60,13 @@ def to_timestamp( ---------- dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime] A datetime-like object or array of datetime-like objects to be converted + precision : Optional[Literal["ns", "us", "s", "ms"]] + The precision of the timestamp. If None, the default precision is used. 
+ The default precision is ns for Pandas 2 and before; us for Pandas 3 + - "ns": nanoseconds + - "us": microseconds + - "ms": milliseconds + - "s": seconds Returns ------- @@ -74,6 +83,9 @@ def to_timestamp( else: pd_dates = pd.to_datetime(dates) + if precision is not None: + pd_dates = pd_dates.astype(f"datetime64[{precision}]") + if squeeze_output: return next(iter(pd_dates)) return pd_dates @@ -99,6 +111,7 @@ def to_datetime( def to_datetime64( dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime], + precision: Optional[Literal["ns", "us", "s", "ms"]] = None, ) -> np.datetime64: """Converts a provided datetime-like object to numpy datetime64 objects @@ -106,17 +119,25 @@ def to_datetime64( ---------- dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime] A datetime-like object or array of datetime-like objects to be converted + precision : Optional[Literal["ns", "us", "s", "ms"]] + The precision of the timestamp. If None, the default precision is used. + The default precision is ns for Pandas 2 and before; us for Pandas 3 + - "ns": nanoseconds + - "us": microseconds + - "ms": milliseconds + - "s": seconds Returns ------- np.datetime64 A numpy datetime64 or array of numpy datetime64s """ - return to_timestamp(dates).to_numpy() + return to_timestamp(dates, precision).to_numpy() def to_datestr( dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime], + precision: Optional[Literal["ns", "us", "s", "ms"]] = None, ) -> str: """Converts a provided datetime-like object to ISO format date strings @@ -124,24 +145,64 @@ def to_datestr( ---------- dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime] A datetime-like object or array of datetime-like objects to be converted + precision : Optional[Literal["ns", "us", "s", "ms"]] + The precision of the timestamp. If None, the default precision is used. 
+ The default precision is ns for Pandas 2 and before; us for Pandas 3 + - "ns": nanoseconds + - "us": microseconds + - "ms": milliseconds + - "s": seconds Returns ------- str A string or array of strings in ISO date format """ - dates = to_datetime64(dates) + dates = to_datetime64(dates, precision) if hasattr(dates, "__iter__"): return dates.astype(str) return str(dates) +def detect_str_precision(datestr: str) -> Literal["s", "ms", "us", "ns"]: + """Detects the precision of a datetime str by counting the number of digits after . + Parameters + ---------- + datestr : str + Input string + + Returns + ------- + Literal['s', 'ms', 'us', 'ns'] + The precision of the string based on the number of digits after . + + Raises + ------ + ValueError + Raises a ValueError if the fractional-second part has more than 9 + digits, i.e. the string is finer than nanosecond precision + """ + + digits_matching = re.search(r"\.(\d+)", datestr) + if not digits_matching: + return "s" + n = len(digits_matching.group(1)) + if n <= 3: + return "ms" + elif n <= 6: + return "us" + elif n <= 9: + return "ns" + else: + raise ValueError("Finer than ns precision.") + + def match_datetime_format( dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime], target: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime], ) -> Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]: """Converts the provided datetime-like objects to the same datetime format - as the provided target + as the provided target, ensuring that the precisions match Parameters ---------- @@ -164,7 +225,8 @@ def match_datetime_format( of datetime-like objects """ if isinstance(target, str): - return to_datestr(dates) + precision = detect_str_precision(target) + return to_datestr(dates, precision) if isinstance(target, xr.DataArray): target = target.values if isinstance(target, pd.Series): @@ -172,13 +234,16 @@ def match_datetime_format( if hasattr(target,
"__iter__"): target = target[0] if isinstance(target, str): - return to_datestr(dates) + precision = detect_str_precision(target) + return to_datestr(dates, precision) if isinstance(target, cftime.datetime): return to_cftime(dates, target.calendar) if isinstance(target, pd.Timestamp): - return to_timestamp(dates) + precision = target.unit + return to_timestamp(dates, precision=precision) if isinstance(target, np.datetime64): - return to_datetime64(dates) + precision = np.datetime_data(target)[0] + return to_datetime64(dates, precision=precision) if isinstance(target, datetime.datetime): return to_datetime(dates) raise ValueError("Target is not a valid datetime format")