Skip to content

Commit 2ecb52b

Browse files
committed
implement default precision timestamp using predefined options with default "ns" resolution
1 parent 5c64182 commit 2ecb52b

14 files changed

+161
-113
lines changed

xarray/coding/cftime_offsets.py

+4-14
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
from xarray.core.pdcompat import (
6565
NoDefault,
6666
count_not_none,
67-
nanosecond_precision_timestamp,
67+
default_precision_timestamp,
6868
no_default,
6969
)
7070
from xarray.core.utils import emit_user_level_warning
@@ -83,21 +83,13 @@
8383
T_FreqStr = TypeVar("T_FreqStr", str, None)
8484

8585

86-
def _nanosecond_precision_timestamp(*args, **kwargs):
87-
# As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to
88-
# infer the appropriate datetime precision. Until xarray supports
89-
# non-nanosecond precision times, we will use this constructor wrapper to
90-
# explicitly create nanosecond-precision Timestamp objects.
91-
return pd.Timestamp(*args, **kwargs).as_unit("ns")
92-
93-
9486
def get_date_type(calendar, use_cftime=True):
9587
"""Return the cftime date type for a given calendar name."""
9688
if cftime is None:
9789
raise ImportError("cftime is required for dates with non-standard calendars")
9890
else:
9991
if _is_standard_calendar(calendar) and not use_cftime:
100-
return _nanosecond_precision_timestamp
92+
return default_precision_timestamp
10193

10294
calendars = {
10395
"noleap": cftime.DatetimeNoLeap,
@@ -1475,10 +1467,8 @@ def date_range_like(source, calendar, use_cftime=None):
14751467
if is_np_datetime_like(source.dtype):
14761468
# We want to use datetime fields (datetime64 object don't have them)
14771469
source_calendar = "standard"
1478-
# TODO: the strict enforcement of nanosecond precision Timestamps can be
1479-
# relaxed when addressing GitHub issue #7493.
1480-
source_start = nanosecond_precision_timestamp(source_start)
1481-
source_end = nanosecond_precision_timestamp(source_end)
1470+
source_start = default_precision_timestamp(source_start)
1471+
source_end = default_precision_timestamp(source_end)
14821472
else:
14831473
if isinstance(source, CFTimeIndex):
14841474
source_calendar = source.calendar

xarray/coding/times.py

+75-36
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
2525
from xarray.core.duck_array_ops import asarray, ravel, reshape
2626
from xarray.core.formatting import first_n_items, format_timestamp, last_item
27-
from xarray.core.pdcompat import nanosecond_precision_timestamp
27+
from xarray.core.options import _get_datetime_resolution
28+
from xarray.core.pdcompat import default_precision_timestamp
2829
from xarray.core.utils import emit_user_level_warning
2930
from xarray.core.variable import Variable
3031
from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
@@ -193,9 +194,7 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
193194
# same as _unpack_netcdf_time_units but finalizes ref_date for
194195
# processing in encode_cf_datetime
195196
time_units, _ref_date = _unpack_netcdf_time_units(units)
196-
# TODO: the strict enforcement of nanosecond precision Timestamps can be
197-
# relaxed when addressing GitHub issue #7493.
198-
ref_date = nanosecond_precision_timestamp(_ref_date)
197+
ref_date = default_precision_timestamp(_ref_date)
199198
# If the ref_date Timestamp is timezone-aware, convert to UTC and
200199
# make it timezone-naive (GH 2649).
201200
if ref_date.tz is not None:
@@ -266,20 +265,54 @@ def _decode_datetime_with_pandas(
266265
time_units, ref_date_str = _unpack_netcdf_time_units(units)
267266
time_units = _netcdf_to_numpy_timeunit(time_units)
268267
try:
269-
# TODO: the strict enforcement of nanosecond precision Timestamps can be
270-
# relaxed when addressing GitHub issue #7493.
271-
ref_date = nanosecond_precision_timestamp(ref_date_str)
268+
# relaxed to non-nanosecond resolution
269+
ref_date = pd.Timestamp(ref_date_str)
270+
# strip tz information
271+
if ref_date.tz is not None:
272+
ref_date = ref_date.tz_convert(None)
273+
# get default unit and delta
274+
default_unit = _get_datetime_resolution()
275+
default_delta = np.timedelta64(1, default_unit).astype("timedelta64[ns]")
276+
# get ref_date and time delta
277+
ref_date_delta = np.timedelta64(1, ref_date.unit).astype("timedelta64[ns]")
278+
time_delta = np.timedelta64(1, time_units).astype("timedelta64[ns]")
279+
# choose the highest resolution
280+
new_time_units = {
281+
ref_date_delta: ref_date.unit,
282+
time_delta: time_units,
283+
default_delta: default_unit,
284+
}[min(default_delta, ref_date_delta, time_delta)]
285+
# transform to the highest needed resolution
286+
# this will raise accordingly
287+
ref_date = ref_date.as_unit(new_time_units)
272288
except ValueError as err:
273289
# ValueError is raised by pd.Timestamp for non-ISO timestamp
274290
# strings, in which case we fall back to using cftime
275291
raise OutOfBoundsDatetime from err
276292

293+
dunit = ref_date.unit
294+
277295
with warnings.catch_warnings():
278296
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
279297
if flat_num_dates.size > 0:
280298
# avoid size 0 datetimes GH1329
281-
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
282-
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
299+
fnd_min, fnd_max = flat_num_dates.min(), flat_num_dates.max()
300+
min_delta = fnd_min * np.timedelta64(1, time_units)
301+
max_delta = fnd_max * np.timedelta64(1, time_units)
302+
if not np.isnan(min_delta):
303+
# this will raise on overflow
304+
(ref_date + min_delta).as_unit(dunit)
305+
# this will raise on dtype overflow
306+
if not np.int64(min_delta) == fnd_min:
307+
# todo: add error message
308+
raise OutOfBoundsTimedelta
309+
if not np.isnan(max_delta):
310+
# this will raise on overflow
311+
(ref_date + max_delta).as_unit(dunit)
312+
# this will raise on dtype overflow
313+
if not np.int64(max_delta) == fnd_max:
314+
# todo: add error message
315+
raise OutOfBoundsTimedelta
283316

284317
# To avoid integer overflow when converting to nanosecond units for integer
285318
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -292,20 +325,25 @@ def _decode_datetime_with_pandas(
292325
elif flat_num_dates.dtype.kind in "f":
293326
flat_num_dates = flat_num_dates.astype(np.float64)
294327

295-
# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
296-
# works much faster when dealing with integers (GH 1399).
297-
# properly handle NaN/NaT to prevent casting NaN to int
328+
# keep NaT/nan mask
298329
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
299-
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
300-
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
301-
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
302-
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
303330

304-
# Use pd.to_timedelta to safely cast integer values to timedeltas,
305-
# and add those to a Timestamp to safely produce a DatetimeIndex. This
306-
# ensures that we do not encounter integer overflow at any point in the
307-
# process without raising OutOfBoundsDatetime.
308-
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
331+
# in case we need to change the unit, we fix the numbers here
332+
# this should be safe, as errors would have been raised above
333+
ns_time_unit = _NS_PER_TIME_DELTA[time_units]
334+
ns_dunit = _NS_PER_TIME_DELTA[dunit]
335+
if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_dunit):
336+
flat_num_dates *= np.int64(ns_time_unit / ns_dunit)
337+
time_units = dunit
338+
339+
# Cast input ordinals to integers and properly handle NaN/NaT
340+
# to prevent casting NaN to int
341+
flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
342+
flat_num_dates_int[nan] = np.iinfo(np.int64).min
343+
flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)
344+
345+
# cast to timedelta64[time_units] and add to ref_date
346+
return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_units}]")
309347

310348

311349
def decode_cf_datetime(
@@ -370,7 +408,7 @@ def to_timedelta_unboxed(value, **kwargs):
370408

371409
def to_datetime_unboxed(value, **kwargs):
372410
result = pd.to_datetime(value, **kwargs).to_numpy()
373-
assert result.dtype == "datetime64[ns]"
411+
assert result.dtype == f"datetime64[{_get_datetime_resolution()}]"
374412
return result
375413

376414

@@ -390,7 +428,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta:
390428

391429
def _unit_timedelta_numpy(units: str) -> np.timedelta64:
392430
numpy_units = _netcdf_to_numpy_timeunit(units)
393-
return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns")
431+
default_unit = _get_datetime_resolution()
432+
return np.timedelta64(
433+
int(_NS_PER_TIME_DELTA[numpy_units] / _NS_PER_TIME_DELTA[default_unit]),
434+
default_unit,
435+
)
394436

395437

396438
def _infer_time_units_from_diff(unique_timedeltas) -> str:
@@ -411,7 +453,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
411453

412454

413455
def _time_units_to_timedelta64(units: str) -> np.timedelta64:
414-
return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]")
456+
default_unit = _get_datetime_resolution()
457+
return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype(
458+
f"timedelta64[{default_unit}]"
459+
)
415460

416461

417462
def infer_calendar_name(dates) -> CFCalendar:
@@ -440,13 +485,11 @@ def infer_datetime_units(dates) -> str:
440485
unique time deltas in `dates`)
441486
"""
442487
dates = ravel(np.asarray(dates))
443-
if np.asarray(dates).dtype == "datetime64[ns]":
488+
if np.issubdtype(np.asarray(dates).dtype, "datetime64"):
444489
dates = to_datetime_unboxed(dates)
445490
dates = dates[pd.notnull(dates)]
446491
reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
447-
# TODO: the strict enforcement of nanosecond precision Timestamps can be
448-
# relaxed when addressing GitHub issue #7493.
449-
reference_date = nanosecond_precision_timestamp(reference_date)
492+
reference_date = default_precision_timestamp(reference_date)
450493
else:
451494
reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
452495
reference_date = format_cftime_datetime(reference_date)
@@ -479,17 +522,15 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray:
479522
If raise_on_invalid is True (default), invalid dates trigger a ValueError.
480523
Otherwise, the invalid element is replaced by np.NaT."""
481524
times = np.asarray(times)
482-
# TODO: the strict enforcement of nanosecond precision datetime values can
483-
# be relaxed when addressing GitHub issue #7493.
484-
new = np.empty(times.shape, dtype="M8[ns]")
525+
new = np.empty(times.shape, dtype=f"M8[{_get_datetime_resolution()}]")
485526
dt: pd.Timestamp | Literal["NaT"]
486527
for i, t in np.ndenumerate(times):
487528
try:
488529
# Use pandas.Timestamp in place of datetime.datetime, because
489530
# NumPy casts it safely to np.datetime64[ns] for dates outside
490531
# 1678 to 2262 (this is not currently the case for
491532
# datetime.datetime).
492-
dt = nanosecond_precision_timestamp(
533+
dt = default_precision_timestamp(
493534
t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond
494535
)
495536
except ValueError as e:
@@ -546,10 +587,8 @@ def convert_time_or_go_back(date, date_type):
546587
547588
This is meant to convert end-of-month dates into a new calendar.
548589
"""
549-
# TODO: the strict enforcement of nanosecond precision Timestamps can be
550-
# relaxed when addressing GitHub issue #7493.
551590
if date_type == pd.Timestamp:
552-
date_type = nanosecond_precision_timestamp
591+
date_type = default_precision_timestamp
553592
try:
554593
return date_type(
555594
date.year,
@@ -757,7 +796,7 @@ def _eagerly_encode_cf_datetime(
757796
if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
758797
# parse with cftime instead
759798
raise OutOfBoundsDatetime
760-
assert dates.dtype == "datetime64[ns]"
799+
assert np.issubdtype(dates.dtype, "datetime64")
761800

762801
time_units, ref_date = _unpack_time_units_and_ref_date(units)
763802
time_delta = _time_units_to_timedelta64(time_units)

xarray/core/options.py

+9
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,12 @@ class T_Options(TypedDict):
8686
"use_flox": True,
8787
"use_numbagg": True,
8888
"use_opt_einsum": True,
89+
"time_resolution": "ns",
8990
}
9091

9192
_JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"])
9293
_DISPLAY_OPTIONS = frozenset(["text", "html"])
94+
_TIME_RESOLUTION_OPTIONS = frozenset(["s", "ms", "us", "ns"])
9395

9496

9597
def _positive_integer(value: Any) -> bool:
@@ -117,6 +119,7 @@ def _positive_integer(value: Any) -> bool:
117119
"use_opt_einsum": lambda value: isinstance(value, bool),
118120
"use_flox": lambda value: isinstance(value, bool),
119121
"warn_for_unclosed_files": lambda value: isinstance(value, bool),
122+
"time_resolution": _TIME_RESOLUTION_OPTIONS.__contains__,
120123
}
121124

122125

@@ -158,6 +161,10 @@ def _get_keep_attrs(default: bool) -> bool:
158161
return _get_boolean_with_default("keep_attrs", default)
159162

160163

164+
def _get_datetime_resolution() -> str:
165+
return OPTIONS["time_resolution"]
166+
167+
161168
class set_options:
162169
"""
163170
Set options for xarray in a controlled context.
@@ -258,6 +265,8 @@ class set_options:
258265
warn_for_unclosed_files : bool, default: False
259266
Whether or not to issue a warning when unclosed files are
260267
deallocated. This is mostly useful for debugging.
268+
time_resolution : {"s", "ms", "us", "ns"}, default: "ns"
269+
Time resolution used for CF encoding/decoding.
261270
262271
Examples
263272
--------

xarray/core/pdcompat.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@
3939
from typing import Literal
4040

4141
import pandas as pd
42-
from packaging.version import Version
42+
43+
from xarray.core.options import _get_datetime_resolution
4344

4445

4546
def count_not_none(*args) -> int:
@@ -73,13 +74,16 @@ def __repr__(self) -> str:
7374
NoDefault = Literal[_NoDefault.no_default] # For typing following pandas
7475

7576

76-
def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
77-
"""Return a nanosecond-precision Timestamp object.
77+
def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
78+
"""Return a Timestamp object with the default precision.
7879
79-
Note this function should no longer be needed after addressing GitHub issue
80-
#7493.
80+
Xarray default is "ns". This can be overridden by setting
81+
set_options(time_resolution="us") or any other resolution
82+
of {"s", "ms", "us", "ns"}.
8183
"""
82-
if Version(pd.__version__) >= Version("2.0.0"):
83-
return pd.Timestamp(*args, **kwargs).as_unit("ns")
84-
else:
85-
return pd.Timestamp(*args, **kwargs)
84+
dt = pd.Timestamp(*args, **kwargs)
85+
units = ["s", "ms", "us", "ns"]
86+
default = _get_datetime_resolution()
87+
if units.index(default) > units.index(dt.unit):
88+
dt = dt.as_unit(default)
89+
return dt

0 commit comments

Comments
 (0)