Skip to content

Commit 4088ef2

Browse files
committed
OGRLayer::GetArrowStream(): add a DATETIME_AS_STRING=YES/NO option
DATETIME_AS_STRING=YES/NO. Defaults to NO. Added in GDAL 3.11. Whether DateTime fields should be returned as a (normally ISO-8601 formatted) string by drivers. The aim is to be able to handle mixed timezones (or timezone naive values) in the same column. All drivers must honour that option, and potentially fall back to the OGRLayer generic implementation if they cannot (which is the case for the Arrow, Parquet and ADBC drivers). When DATETIME_AS_STRING=YES, the TIMEZONE option is ignored. Fixes geopandas/pyogrio#487
1 parent 53c759a commit 4088ef2

14 files changed

+498
-73
lines changed

autotest/ogr/ogr_adbc.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,36 @@ def test_ogr_adbc_test_ogrsf_parquet_filename_with_glob():
326326
assert "ERROR" not in ret
327327

328328

329+
###############################################################################
330+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
331+
332+
333+
def test_ogr_adbc_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
334+
pytest.importorskip("osgeo.gdal_array")
335+
pytest.importorskip("numpy")
336+
337+
if not _has_libduckdb():
338+
pytest.skip("libduckdb.so missing")
339+
340+
with gdal.OpenEx(
341+
"data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["ADBC"]
342+
) as ds:
343+
lyr = ds.GetLayer(0)
344+
stream = lyr.GetArrowStreamAsNumPy(
345+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
346+
)
347+
batches = [batch for batch in stream]
348+
batch = batches[0]
349+
# Should be "2019-01-01T14:00:00.500-02:15" but DuckDB returns in UTC
350+
# On my machine, for some reason it returns without the Z, whereas on
351+
# the ubuntu_2404 it returns with the Z... despite both using libduckdb 1.1.3
352+
# at time of writing...
353+
assert batch["timestamp_ms_gmt_minus_0215"][0] in (
354+
b"2019-01-01T16:15:00.500",
355+
b"2019-01-01T16:15:00.500Z",
356+
)
357+
358+
329359
###############################################################################
330360
# Run test_ogrsf on a DuckDB dataset
331361

autotest/ogr/ogr_flatgeobuf.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1564,3 +1564,52 @@ def test_ogr_flatgeobuf_sql_arrow(tmp_vsimem):
15641564
assert f["bar"] == "baz"
15651565
assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)"
15661566
f = tmp_lyr.GetNextFeature()
1567+
1568+
1569+
###############################################################################
1570+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
1571+
1572+
1573+
def test_ogr_flatgeobuf_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
1574+
pytest.importorskip("osgeo.gdal_array")
1575+
pytest.importorskip("numpy")
1576+
1577+
filename = str(tmp_vsimem / "datetime_as_string.fgb")
1578+
with ogr.GetDriverByName("FlatGeoBuf").CreateDataSource(filename) as ds:
1579+
lyr = ds.CreateLayer("test")
1580+
1581+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
1582+
lyr.CreateField(field)
1583+
1584+
f = ogr.Feature(lyr.GetLayerDefn())
1585+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1586+
lyr.CreateFeature(f)
1587+
1588+
f = ogr.Feature(lyr.GetLayerDefn())
1589+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
1590+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1591+
lyr.CreateFeature(f)
1592+
1593+
f = ogr.Feature(lyr.GetLayerDefn())
1594+
f.SetField("datetime", "2022-05-31T12:34:56")
1595+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1596+
lyr.CreateFeature(f)
1597+
1598+
f = ogr.Feature(lyr.GetLayerDefn())
1599+
f.SetField("datetime", "2022-05-31T12:34:56+12:30")
1600+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1601+
lyr.CreateFeature(f)
1602+
1603+
with ogr.Open(filename) as ds:
1604+
lyr = ds.GetLayer(0)
1605+
stream = lyr.GetArrowStreamAsNumPy(
1606+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
1607+
)
1608+
batches = [batch for batch in stream]
1609+
assert len(batches) == 1
1610+
batch = batches[0]
1611+
assert len(batch["datetime"]) == 4
1612+
assert batch["datetime"][0] == b""
1613+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
1614+
assert batch["datetime"][2] == b"2022-05-31T12:34:56"
1615+
assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30"

autotest/ogr/ogr_gpkg.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10831,3 +10831,76 @@ def test_ogr_gpkg_write_check_golden_file(tmp_path, src_filename):
1083110831
golden_data[96] = golden_data[97] = golden_data[98] = golden_data[99] = 0
1083210832
got_data[96] = got_data[97] = got_data[98] = got_data[99] = 0
1083310833
assert got_data == golden_data
10834+
10835+
10836+
###############################################################################
10837+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
10838+
10839+
10840+
def test_ogr_gpkg_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
10841+
pytest.importorskip("osgeo.gdal_array")
10842+
pytest.importorskip("numpy")
10843+
10844+
filename = str(tmp_vsimem / "datetime_as_string.gpkg")
10845+
ds = ogr.GetDriverByName("GPKG").CreateDataSource(filename)
10846+
lyr = ds.CreateLayer("test")
10847+
10848+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
10849+
lyr.CreateField(field)
10850+
10851+
f = ogr.Feature(lyr.GetLayerDefn())
10852+
lyr.CreateFeature(f)
10853+
10854+
f = ogr.Feature(lyr.GetLayerDefn())
10855+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
10856+
lyr.CreateFeature(f)
10857+
10858+
f = ogr.Feature(lyr.GetLayerDefn())
10859+
f.SetField("datetime", "2022-05-31T12:34:56.000")
10860+
lyr.CreateFeature(f)
10861+
10862+
f = ogr.Feature(lyr.GetLayerDefn())
10863+
f.SetField("datetime", "2022-05-31T12:34:56.000+12:30")
10864+
lyr.CreateFeature(f)
10865+
10866+
# Test DATETIME_AS_STRING=YES
10867+
stream = lyr.GetArrowStreamAsNumPy(
10868+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10869+
)
10870+
batches = [batch for batch in stream]
10871+
assert len(batches) == 1
10872+
batch = batches[0]
10873+
assert len(batch["datetime"]) == 4
10874+
assert batch["datetime"][0] == b""
10875+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10876+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10877+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"
10878+
10879+
# Setting a filter tests the use of the less optimized
10880+
# OGRGeoPackageTableLayer::GetNextArray() implementation
10881+
lyr.SetAttributeFilter("1 = 1")
10882+
stream = lyr.GetArrowStreamAsNumPy(
10883+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10884+
)
10885+
lyr.SetAttributeFilter(None)
10886+
batches = [batch for batch in stream]
10887+
assert len(batches) == 1
10888+
batch = batches[0]
10889+
assert len(batch["datetime"]) == 4
10890+
assert batch["datetime"][0] == b""
10891+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10892+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10893+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"
10894+
10895+
with ds.ExecuteSQL("SELECT * FROM test") as sql_lyr:
10896+
stream = sql_lyr.GetArrowStreamAsNumPy(
10897+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10898+
)
10899+
batches = [batch for batch in stream]
10900+
assert len(batches) == 1
10901+
batch = batches[0]
10902+
assert len(batch["datetime"]) == 4
10903+
assert batch["datetime"][0] == b""
10904+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10905+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10906+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"

autotest/ogr/ogr_mem.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -979,6 +979,49 @@ def test_ogr_mem_arrow_stream_numpy():
979979
assert len(batches) == 0
980980

981981

982+
###############################################################################
983+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
984+
985+
986+
def test_ogr_mem_arrow_stream_numpy_datetime_as_string():
987+
pytest.importorskip("osgeo.gdal_array")
988+
pytest.importorskip("numpy")
989+
990+
ds = ogr.GetDriverByName("Memory").CreateDataSource("")
991+
lyr = ds.CreateLayer("foo")
992+
993+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
994+
lyr.CreateField(field)
995+
996+
f = ogr.Feature(lyr.GetLayerDefn())
997+
lyr.CreateFeature(f)
998+
999+
f = ogr.Feature(lyr.GetLayerDefn())
1000+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
1001+
lyr.CreateFeature(f)
1002+
1003+
f = ogr.Feature(lyr.GetLayerDefn())
1004+
f.SetField("datetime", "2022-05-31T12:34:56")
1005+
lyr.CreateFeature(f)
1006+
1007+
f = ogr.Feature(lyr.GetLayerDefn())
1008+
f.SetField("datetime", "2022-05-31T12:34:56+12:30")
1009+
lyr.CreateFeature(f)
1010+
1011+
# Test DATETIME_AS_STRING=YES
1012+
stream = lyr.GetArrowStreamAsNumPy(
1013+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
1014+
)
1015+
batches = [batch for batch in stream]
1016+
assert len(batches) == 1
1017+
batch = batches[0]
1018+
assert len(batch["datetime"]) == 4
1019+
assert batch["datetime"][0] == b""
1020+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
1021+
assert batch["datetime"][2] == b"2022-05-31T12:34:56"
1022+
assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30"
1023+
1024+
9821025
###############################################################################
9831026

9841027

autotest/ogr/ogr_parquet.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4167,4 +4167,25 @@ def test_ogr_parquet_ogr2ogr_reprojection(tmp_vsimem):
41674167
with ogr.Open(outfilename) as ds:
41684168
assert ds.GetLayer(0).GetExtent() == pytest.approx(
41694169
(8.73380363499761, 8.774681944824946, 43.01833481785084, 43.04292637071279)
4170+
4171+
4172+
###############################################################################
4173+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
4174+
4175+
4176+
def test_ogr_parquet_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
4177+
pytest.importorskip("osgeo.gdal_array")
4178+
pytest.importorskip("numpy")
4179+
4180+
with gdal.OpenEx(
4181+
"data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["Parquet"]
4182+
) as ds:
4183+
lyr = ds.GetLayer(0)
4184+
stream = lyr.GetArrowStreamAsNumPy(
4185+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
4186+
)
4187+
batches = [batch for batch in stream]
4188+
batch = batches[0]
4189+
assert (
4190+
batch["timestamp_ms_gmt_minus_0215"][0] == b"2019-01-01T14:00:00.500-02:15"
41704191
)

ogr/ogrsf_frmts/adbc/ogradbclayer.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,11 @@ GDALDataset *OGRADBCLayer::GetDataset()
155155
bool OGRADBCLayer::GetArrowStream(struct ArrowArrayStream *out_stream,
156156
CSLConstList papszOptions)
157157
{
158-
if (m_poFilterGeom || m_poAttrQuery)
158+
if (m_poFilterGeom || m_poAttrQuery ||
159+
CPLFetchBool(papszOptions, GAS_OPT_DATETIME_AS_STRING, false))
160+
{
159161
return OGRLayer::GetArrowStream(out_stream, papszOptions);
162+
}
160163

161164
if (m_stream)
162165
{

ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5490,6 +5490,24 @@ inline bool OGRArrowLayer::UseRecordBatchBaseImplementation() const
54905490
return true;
54915491
}
54925492

5493+
if (m_aosArrowArrayStreamOptions.FetchBool(GAS_OPT_DATETIME_AS_STRING,
5494+
false))
5495+
{
5496+
const int nFieldCount = m_poFeatureDefn->GetFieldCount();
5497+
for (int i = 0; i < nFieldCount; ++i)
5498+
{
5499+
const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i);
5500+
if (!poFieldDefn->IsIgnored() &&
5501+
poFieldDefn->GetType() == OFTDateTime)
5502+
{
5503+
CPLDebug("ARROW",
5504+
"DATETIME_AS_STRING=YES not compatible of fast "
5505+
"Arrow implementation");
5506+
return true;
5507+
}
5508+
}
5509+
}
5510+
54935511
if (EQUAL(m_aosArrowArrayStreamOptions.FetchNameValueDef(
54945512
"GEOMETRY_ENCODING", ""),
54955513
"WKB"))

0 commit comments

Comments
 (0)