Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion arro3-core/python/arro3/core/_data_type.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ class DataType:

Args:
unit: one of `'s'` [second], `'ms'` [millisecond], `'us'` [microsecond], or `'ns'` [nanosecond]
tz: Time zone name. None indicates time zone naive. Defaults to None.
tz: Time zone name. None indicates time zone naive. Defaults to None. Supported
values are IANA time zone names (e.g. `'Europe/Madrid'`); see `pytz.all_timezones` for the full list.

Returns:
_description_
Expand Down
57 changes: 56 additions & 1 deletion pyo3-arrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ use arrow_array::types::{
use arrow_array::{
Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Datum, FixedSizeBinaryArray,
LargeBinaryArray, LargeStringArray, PrimitiveArray, StringArray, StringViewArray,
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
TimestampSecondArray,
};
use arrow_cast::cast;
use arrow_cast::display::ArrayFormatter;
use arrow_schema::{ArrowError, DataType, Field, FieldRef};
use arrow_schema::{ArrowError, DataType, Field, FieldRef, TimeUnit};
use arrow_select::concat::concat;
use arrow_select::take::take;
use chrono::{FixedOffset, Utc};
use numpy::PyUntypedArray;
use pyo3::exceptions::{PyIndexError, PyNotImplementedError, PyValueError};
use pyo3::intern;
Expand Down Expand Up @@ -212,6 +215,7 @@ impl PyArray {
"type must be passed for non-Arrow input",
))?
.into_inner();

let array: ArrayRef = match field.data_type() {
DataType::Float32 => impl_primitive!(f32, Float32Type),
DataType::Float64 => impl_primitive!(f64, Float64Type),
Expand Down Expand Up @@ -284,6 +288,57 @@ impl PyArray {
.collect::<Vec<_>>();
Arc::new(StringViewArray::from(slices))
}
DataType::Timestamp(unit, tz) => {
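// We normalize all datetimes to UTC before extracting the integer value.
// An Arrow timestamp stores only an i64 count of `unit`s since the Unix epoch;
// when the type carries a time zone, that count is defined relative to UTC and
// the zone is metadata used for display/conversion, so converting a tz-aware
// input to UTC preserves the instant it represents. For a type without a time
// zone, the naive wall-clock value is stored unchanged (it is tagged as UTC
// below purely so the same epoch-offset helpers can be used).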
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a more detailed comment here, explaining why this is valid?

let values: Vec<Option<chrono::DateTime<Utc>>> = match tz {
Some(_) => {
let vs: Vec<Option<chrono::DateTime<FixedOffset>>> = obj.extract()?;
vs.into_iter()
.map(|v| v.map(|dt| dt.with_timezone(&Utc)))
.collect()
}
None => {
let vs: Vec<Option<chrono::NaiveDateTime>> = obj.extract()?;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This says that if the type passed in doesn't have a tz, then ignore any tz passed with the input data. That seems easily wrong if we have a variety of timezones in our array of input timestamps; I'm curious how pyarrow handles this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not ignore it, everything is converted to chrono::NaiveDateTime, so if there is a dt with tz it errors out:

dt = datetime(1999, 8, 7, 11, 12, 13, 141516)
tzinfo = zoneinfo.ZoneInfo('Europe/Madrid')
dt2: datetime = dt.astimezone(timezone(tzinfo.utcoffset(dt)))

arr = Array([dt, None, dt2], type=DataType.timestamp("ms"))
# E       TypeError: expected a datetime without tzinfo

In pyarrow it seems to work this way:

Same datetime, one has a tz and array type is tz

dt = datetime(1999, 8, 7, 11, 12, 13, 141516)
tzinfo = zoneinfo.ZoneInfo('Europe/Madrid')
dt2: datetime = dt.astimezone(timezone(tzinfo.utcoffset(dt)))
arr = pyarrow.array([dt, None, dt2], type=pyarrow.timestamp('ms', 'Europe/Madrid'))
print(arr.type)

It does not error out:

[
  1999-08-07 11:12:13.141Z,
  null,
  1999-08-07 09:12:13.141Z
]
timestamp[ms, tz=Europe/Madrid]

It applies the given tz to all dates.

Back to python:

[
    datetime.datetime(1999, 8, 7, 13, 12, 13, 141000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>),
    None,
    datetime.datetime(1999, 8, 7, 11, 12, 13, 141000, tzinfo=<DstTzInfo 'Europe/Madrid' CEST+2:00:00 DST>)
]

Both have applied tz.

Same datetime, one has tz and array type does not have tz

[
    datetime.datetime(1999, 8, 7, 11, 12, 13, 141516),
    None, 
    datetime.datetime(1999, 8, 7, 9, 12, 13, 141516)
]

Both dates are still as is, but tz information is now lost.

So I guess the expected functionality would be to apply the given tz to all dates, if no tz exists just take the dates as is.

Copy link
Owner

@kylebarron kylebarron Jan 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not error out:

[
  1999-08-07 11:12:13.141Z,
  null,
  1999-08-07 09:12:13.141Z
]
timestamp[ms, tz=Europe/Madrid]

That feels wrong...? You have two input datetimes that represent the same time. And when they are returned to Python they have the same time. But displayed from Rust they are displayed as different.

Edit: I didn't see that back in Python they are different

vs.into_iter()
.map(|v| v.map(|naive| naive.and_utc()))
.collect()
}
};
match unit {
TimeUnit::Second => {
let values: Vec<_> =
values.iter().map(|v| v.map(|x| x.timestamp())).collect();
Arc::new(TimestampSecondArray::from(values).with_timezone_opt(tz.clone()))
}
TimeUnit::Millisecond => {
let values: Vec<Option<i64>> = values
.iter()
.map(|v| v.map(|x| x.timestamp_millis()))
.collect();
Arc::new(
TimestampMillisecondArray::from(values).with_timezone_opt(tz.clone()),
)
}
TimeUnit::Microsecond => {
let values: Vec<Option<i64>> = values
.iter()
.map(|v| v.map(|x| x.timestamp_micros()))
.collect();
Arc::new(
TimestampMicrosecondArray::from(values).with_timezone_opt(tz.clone()),
)
}
TimeUnit::Nanosecond => {
let values: Vec<Option<i64>> = values
.iter()
.map(|v| v.map(|x| x.timestamp_nanos_opt().unwrap()))
.collect();
Arc::new(
TimestampNanosecondArray::from(values).with_timezone_opt(tz.clone()),
)
}
}
}
dt => {
return Err(PyNotImplementedError::new_err(format!(
"Array constructor for {dt} not yet implemented."
Expand Down
51 changes: 51 additions & 0 deletions tests/core/test_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import zoneinfo
from datetime import datetime, timezone

import pytest
from arro3.core import Array, DataType


@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_array_timestamp_timezone(unit):
    """Round-trip a naive datetime through a timestamp array of the given unit.

    Builds an ``Array`` from ``[datetime, None]`` with ``DataType.timestamp(unit)``
    (no time zone) and checks that the first element read back via ``to_pylist()``
    matches the input down to the second, and that its sub-second part reflects
    the precision of ``unit``.
    """
    source = datetime(1999, 8, 7, 11, 12, 13, 141516)

    # Sub-second part expected after the round trip, keyed by unit:
    # seconds drop it entirely, milliseconds keep three digits, and
    # micro/nanoseconds keep the full microsecond value.
    microsecond_by_unit = {
        "s": 0,
        "ms": 141000,
        "us": source.microsecond,
        "ns": source.microsecond,
    }

    values = Array([source, None], type=DataType.timestamp(unit)).to_pylist()
    roundtripped: datetime = values[0]

    # Whole-second fields must survive regardless of the unit.
    assert roundtripped.replace(microsecond=0) == source.replace(microsecond=0)

    if unit in microsecond_by_unit:
        assert roundtripped.microsecond == microsecond_by_unit[unit]


@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
@pytest.mark.parametrize("tz_name", ["UTC", "America/Chicago", "Europe/Madrid"])
def test_array_timestamp_tz(unit, tz_name):
    """Round-trip a tz-aware datetime through a timestamp array with a time zone.

    The input is a datetime carrying a fixed UTC offset equal to ``tz_name``'s
    offset at the sample date. It is stored in an ``Array`` typed as
    ``DataType.timestamp(unit, tz=tz_name)``; the value read back must denote the
    same instant (to the second), report the same UTC offset, and keep the
    sub-second precision implied by ``unit``.
    """
    naive = datetime(1999, 8, 7, 11, 12, 13, 141516)

    # Fixed-offset zone matching the named zone's offset at `naive`.
    zone = zoneinfo.ZoneInfo(tz_name)
    fixed_offset = timezone(zone.utcoffset(naive))
    expected: datetime = naive.astimezone(fixed_offset)

    # Sub-second part expected after the round trip, keyed by unit.
    microsecond_by_unit = {
        "s": 0,
        "ms": 141000,
        "us": expected.microsecond,
        "ns": expected.microsecond,
    }

    column = Array([expected, None], type=DataType.timestamp(unit, tz=tz_name))
    result: datetime = column.to_pylist()[0]

    # Compare with microseconds stripped first because it's more direct;
    # the sub-second part is checked separately per unit below.
    assert result.replace(microsecond=0) == expected.replace(microsecond=0)
    assert result.tzinfo.utcoffset(naive) == expected.tzinfo.utcoffset(naive)

    if unit in microsecond_by_unit:
        assert result.microsecond == microsecond_by_unit[unit]