-
Notifications
You must be signed in to change notification settings - Fork 22
feat: Support creating timestamp arrays #465
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,12 +8,15 @@ use arrow_array::types::{ | |
| use arrow_array::{ | ||
| Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Datum, FixedSizeBinaryArray, | ||
| LargeBinaryArray, LargeStringArray, PrimitiveArray, StringArray, StringViewArray, | ||
| TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, | ||
| TimestampSecondArray, | ||
| }; | ||
| use arrow_cast::cast; | ||
| use arrow_cast::display::ArrayFormatter; | ||
| use arrow_schema::{ArrowError, DataType, Field, FieldRef}; | ||
| use arrow_schema::{ArrowError, DataType, Field, FieldRef, TimeUnit}; | ||
| use arrow_select::concat::concat; | ||
| use arrow_select::take::take; | ||
| use chrono::{FixedOffset, Utc}; | ||
| use numpy::PyUntypedArray; | ||
| use pyo3::exceptions::{PyIndexError, PyNotImplementedError, PyValueError}; | ||
| use pyo3::intern; | ||
|
|
@@ -212,6 +215,7 @@ impl PyArray { | |
| "type must be passed for non-Arrow input", | ||
| ))? | ||
| .into_inner(); | ||
|
|
||
| let array: ArrayRef = match field.data_type() { | ||
| DataType::Float32 => impl_primitive!(f32, Float32Type), | ||
| DataType::Float64 => impl_primitive!(f64, Float64Type), | ||
|
|
@@ -284,6 +288,57 @@ impl PyArray { | |
| .collect::<Vec<_>>(); | ||
| Arc::new(StringViewArray::from(slices)) | ||
| } | ||
| DataType::Timestamp(unit, tz) => { | ||
| // We normalize all datetimes to datetimes in UTC. | ||
| let values: Vec<Option<chrono::DateTime<Utc>>> = match tz { | ||
| Some(_) => { | ||
| let vs: Vec<Option<chrono::DateTime<FixedOffset>>> = obj.extract()?; | ||
| vs.into_iter() | ||
| .map(|v| v.map(|dt| dt.with_timezone(&Utc))) | ||
| .collect() | ||
| } | ||
| None => { | ||
| let vs: Vec<Option<chrono::NaiveDateTime>> = obj.extract()?; | ||
|
Owner
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This says that if the
Contributor
Author
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does not ignore it, everything is converted to
```python
dt = datetime(1999, 8, 7, 11, 12, 13, 141516)
tzinfo = zoneinfo.ZoneInfo('Europe/Madrid')
dt2: datetime = dt.astimezone(timezone(tzinfo.utcoffset(dt)))
arr = Array([dt, None, dt2], type=DataType.timestamp("ms"))
# E TypeError: expected a datetime without tzinfo
```
In pyarrow it seems to work this way: Same datetime, one has a
|
||
| vs.into_iter() | ||
| .map(|v| v.map(|naive| naive.and_utc())) | ||
| .collect() | ||
| } | ||
| }; | ||
| match unit { | ||
| TimeUnit::Second => { | ||
| let values: Vec<_> = | ||
| values.iter().map(|v| v.map(|x| x.timestamp())).collect(); | ||
| Arc::new(TimestampSecondArray::from(values).with_timezone_opt(tz.clone())) | ||
| } | ||
| TimeUnit::Millisecond => { | ||
| let values: Vec<Option<i64>> = values | ||
| .iter() | ||
| .map(|v| v.map(|x| x.timestamp_millis())) | ||
| .collect(); | ||
| Arc::new( | ||
| TimestampMillisecondArray::from(values).with_timezone_opt(tz.clone()), | ||
| ) | ||
| } | ||
| TimeUnit::Microsecond => { | ||
| let values: Vec<Option<i64>> = values | ||
| .iter() | ||
| .map(|v| v.map(|x| x.timestamp_micros())) | ||
| .collect(); | ||
| Arc::new( | ||
| TimestampMicrosecondArray::from(values).with_timezone_opt(tz.clone()), | ||
| ) | ||
| } | ||
| TimeUnit::Nanosecond => { | ||
| let values: Vec<Option<i64>> = values | ||
| .iter() | ||
| .map(|v| v.map(|x| x.timestamp_nanos_opt().unwrap())) | ||
surister marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| .collect(); | ||
| Arc::new( | ||
| TimestampNanosecondArray::from(values).with_timezone_opt(tz.clone()), | ||
| ) | ||
| } | ||
| } | ||
| } | ||
| dt => { | ||
| return Err(PyNotImplementedError::new_err(format!( | ||
| "Array constructor for {dt} not yet implemented." | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| import zoneinfo | ||
| from datetime import datetime, timezone | ||
|
|
||
| import pytest | ||
| from arro3.core import Array, DataType | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) | ||
| def test_array_timestamp_timezone(unit): | ||
| """Test that an array with timestamp type can be created with different units.""" | ||
| dt = datetime(1999, 8, 7, 11, 12, 13, 141516) | ||
| arr = Array([dt, None], type=DataType.timestamp(unit)) | ||
|
|
||
| result: datetime = arr.to_pylist()[0] | ||
|
|
||
| assert result.replace(microsecond=0) == dt.replace(microsecond=0) | ||
|
|
||
| if unit == "s": | ||
| assert result.microsecond == 0 | ||
|
|
||
| if unit == "ms": | ||
| assert result.microsecond == 141000 | ||
|
|
||
| if unit == "us" or unit == "ns": | ||
| assert result.microsecond == dt.microsecond | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) | ||
| @pytest.mark.parametrize("tz_name", ["UTC", "America/Chicago", "Europe/Madrid"]) | ||
| def test_array_timestamp_tz(unit, tz_name): | ||
| """Test that an array with timestamp type can be created with different units and timezone.""" | ||
| dt = datetime(1999, 8, 7, 11, 12, 13, 141516) | ||
|
|
||
| tzinfo = zoneinfo.ZoneInfo(tz_name) | ||
| expected: datetime = dt.astimezone(timezone(tzinfo.utcoffset(dt))) | ||
|
|
||
| arr = Array([expected, None], type=DataType.timestamp(unit, tz=tz_name)) | ||
| result: datetime = arr.to_pylist()[0] | ||
|
|
||
| # compare without microseconds because it's more direct. | ||
| assert result.replace(microsecond=0) == expected.replace(microsecond=0) | ||
| assert result.tzinfo.utcoffset(dt) == expected.tzinfo.utcoffset(dt) | ||
|
|
||
| if unit == "s": | ||
| assert result.microsecond == 0 | ||
|
|
||
| if unit == "ms": | ||
| assert result.microsecond == 141000 | ||
|
|
||
| if unit == "us" or unit == "ns": | ||
| assert result.microsecond == expected.microsecond |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a more detailed comment here, explaining why this is valid?