Skip to content

Commit 1bc27cf

Browse files
committed
OGRLayer::GetArrowStream(): add a DATETIME_AS_STRING=YES/NO option
DATETIME_AS_STRING=YES/NO. Defaults to NO. Added in GDAL 3.11. Whether DateTime fields should be returned as a (normally ISO-8601 formatted) string by drivers. The aim is to be able to handle mixed timezones (or timezone naive values) in the same column. All drivers must honour that option, and potentially fall back to the OGRLayer generic implementation if they cannot (which is the case for the Arrow, Parquet and ADBC drivers). When DATETIME_AS_STRING=YES, the TIMEZONE option is ignored. Fixes geopandas/pyogrio#487
1 parent c578e9d commit 1bc27cf

File tree

13 files changed

+482
-73
lines changed

13 files changed

+482
-73
lines changed

autotest/ogr/ogr_adbc.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,30 @@ def test_ogr_adbc_test_ogrsf_parquet_filename_with_glob():
326326
assert "ERROR" not in ret
327327

328328

329+
###############################################################################
330+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
331+
332+
333+
def test_ogr_adbc_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
334+
pytest.importorskip("osgeo.gdal_array")
335+
pytest.importorskip("numpy")
336+
337+
if not _has_libduckdb():
338+
pytest.skip("libduckdb.so missing")
339+
340+
with gdal.OpenEx(
341+
"data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["ADBC"]
342+
) as ds:
343+
lyr = ds.GetLayer(0)
344+
stream = lyr.GetArrowStreamAsNumPy(
345+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
346+
)
347+
batches = [batch for batch in stream]
348+
batch = batches[0]
349+
# Should be "2019-01-01T14:00:00.500-02:15" but DuckDB returns in UTC without the timezone
350+
assert batch["timestamp_ms_gmt_minus_0215"][0] == b"2019-01-01T16:15:00.500"
351+
352+
329353
###############################################################################
330354
# Run test_ogrsf on a DuckDB dataset
331355

autotest/ogr/ogr_flatgeobuf.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1564,3 +1564,52 @@ def test_ogr_flatgeobuf_sql_arrow(tmp_vsimem):
15641564
assert f["bar"] == "baz"
15651565
assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)"
15661566
f = tmp_lyr.GetNextFeature()
1567+
1568+
1569+
###############################################################################
1570+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
1571+
1572+
1573+
def test_ogr_flatgeobuf_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
1574+
pytest.importorskip("osgeo.gdal_array")
1575+
pytest.importorskip("numpy")
1576+
1577+
filename = str(tmp_vsimem / "datetime_as_string.fgb")
1578+
with ogr.GetDriverByName("FlatGeoBuf").CreateDataSource(filename) as ds:
1579+
lyr = ds.CreateLayer("test")
1580+
1581+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
1582+
lyr.CreateField(field)
1583+
1584+
f = ogr.Feature(lyr.GetLayerDefn())
1585+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1586+
lyr.CreateFeature(f)
1587+
1588+
f = ogr.Feature(lyr.GetLayerDefn())
1589+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
1590+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1591+
lyr.CreateFeature(f)
1592+
1593+
f = ogr.Feature(lyr.GetLayerDefn())
1594+
f.SetField("datetime", "2022-05-31T12:34:56")
1595+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1596+
lyr.CreateFeature(f)
1597+
1598+
f = ogr.Feature(lyr.GetLayerDefn())
1599+
f.SetField("datetime", "2022-05-31T12:34:56+12:30")
1600+
f.SetGeometry(ogr.CreateGeometryFromWkt("POINT (1 2)"))
1601+
lyr.CreateFeature(f)
1602+
1603+
with ogr.Open(filename) as ds:
1604+
lyr = ds.GetLayer(0)
1605+
stream = lyr.GetArrowStreamAsNumPy(
1606+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
1607+
)
1608+
batches = [batch for batch in stream]
1609+
assert len(batches) == 1
1610+
batch = batches[0]
1611+
assert len(batch["datetime"]) == 4
1612+
assert batch["datetime"][0] == b""
1613+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
1614+
assert batch["datetime"][2] == b"2022-05-31T12:34:56"
1615+
assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30"

autotest/ogr/ogr_gpkg.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10766,3 +10766,76 @@ def test_gpkg_secure_delete(tmp_vsimem):
1076610766
with ds.ExecuteSQL("PRAGMA secure_delete") as sql_lyr:
1076710767
f = sql_lyr.GetNextFeature()
1076810768
assert f.GetField(0) == 0
10769+
10770+
10771+
###############################################################################
10772+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
10773+
10774+
10775+
def test_ogr_gpkg_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
10776+
pytest.importorskip("osgeo.gdal_array")
10777+
pytest.importorskip("numpy")
10778+
10779+
filename = str(tmp_vsimem / "datetime_as_string.gpkg")
10780+
ds = ogr.GetDriverByName("GPKG").CreateDataSource(filename)
10781+
lyr = ds.CreateLayer("test")
10782+
10783+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
10784+
lyr.CreateField(field)
10785+
10786+
f = ogr.Feature(lyr.GetLayerDefn())
10787+
lyr.CreateFeature(f)
10788+
10789+
f = ogr.Feature(lyr.GetLayerDefn())
10790+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
10791+
lyr.CreateFeature(f)
10792+
10793+
f = ogr.Feature(lyr.GetLayerDefn())
10794+
f.SetField("datetime", "2022-05-31T12:34:56.000")
10795+
lyr.CreateFeature(f)
10796+
10797+
f = ogr.Feature(lyr.GetLayerDefn())
10798+
f.SetField("datetime", "2022-05-31T12:34:56.000+12:30")
10799+
lyr.CreateFeature(f)
10800+
10801+
# Test DATETIME_AS_STRING=YES
10802+
stream = lyr.GetArrowStreamAsNumPy(
10803+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10804+
)
10805+
batches = [batch for batch in stream]
10806+
assert len(batches) == 1
10807+
batch = batches[0]
10808+
assert len(batch["datetime"]) == 4
10809+
assert batch["datetime"][0] == b""
10810+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10811+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10812+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"
10813+
10814+
# Setting a filter tests the use of the less optimized
10815+
# OGRGeoPackageTableLayer::GetNextArray() implementation
10816+
lyr.SetAttributeFilter("1 = 1")
10817+
stream = lyr.GetArrowStreamAsNumPy(
10818+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10819+
)
10820+
lyr.SetAttributeFilter(None)
10821+
batches = [batch for batch in stream]
10822+
assert len(batches) == 1
10823+
batch = batches[0]
10824+
assert len(batch["datetime"]) == 4
10825+
assert batch["datetime"][0] == b""
10826+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10827+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10828+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"
10829+
10830+
with ds.ExecuteSQL("SELECT * FROM test") as sql_lyr:
10831+
stream = sql_lyr.GetArrowStreamAsNumPy(
10832+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
10833+
)
10834+
batches = [batch for batch in stream]
10835+
assert len(batches) == 1
10836+
batch = batches[0]
10837+
assert len(batch["datetime"]) == 4
10838+
assert batch["datetime"][0] == b""
10839+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
10840+
assert batch["datetime"][2] == b"2022-05-31T12:34:56.000"
10841+
assert batch["datetime"][3] == b"2022-05-31T12:34:56.000+12:30"

autotest/ogr/ogr_mem.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -979,6 +979,49 @@ def test_ogr_mem_arrow_stream_numpy():
979979
assert len(batches) == 0
980980

981981

982+
###############################################################################
983+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
984+
985+
986+
def test_ogr_mem_arrow_stream_numpy_datetime_as_string():
987+
pytest.importorskip("osgeo.gdal_array")
988+
pytest.importorskip("numpy")
989+
990+
ds = ogr.GetDriverByName("Memory").CreateDataSource("")
991+
lyr = ds.CreateLayer("foo")
992+
993+
field = ogr.FieldDefn("datetime", ogr.OFTDateTime)
994+
lyr.CreateField(field)
995+
996+
f = ogr.Feature(lyr.GetLayerDefn())
997+
lyr.CreateFeature(f)
998+
999+
f = ogr.Feature(lyr.GetLayerDefn())
1000+
f.SetField("datetime", "2022-05-31T12:34:56.789Z")
1001+
lyr.CreateFeature(f)
1002+
1003+
f = ogr.Feature(lyr.GetLayerDefn())
1004+
f.SetField("datetime", "2022-05-31T12:34:56")
1005+
lyr.CreateFeature(f)
1006+
1007+
f = ogr.Feature(lyr.GetLayerDefn())
1008+
f.SetField("datetime", "2022-05-31T12:34:56+12:30")
1009+
lyr.CreateFeature(f)
1010+
1011+
# Test DATETIME_AS_STRING=YES
1012+
stream = lyr.GetArrowStreamAsNumPy(
1013+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
1014+
)
1015+
batches = [batch for batch in stream]
1016+
assert len(batches) == 1
1017+
batch = batches[0]
1018+
assert len(batch["datetime"]) == 4
1019+
assert batch["datetime"][0] == b""
1020+
assert batch["datetime"][1] == b"2022-05-31T12:34:56.789Z"
1021+
assert batch["datetime"][2] == b"2022-05-31T12:34:56"
1022+
assert batch["datetime"][3] == b"2022-05-31T12:34:56+12:30"
1023+
1024+
9821025
###############################################################################
9831026

9841027

autotest/ogr/ogr_parquet.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4150,3 +4150,25 @@ def test_ogr_parquet_IsArrowSchemaSupported_arrow_15_types(
41504150
success, error_msg = dst_lyr.IsArrowSchemaSupported(schema)
41514151
assert not success
41524152
assert error_msg == expected_error_msg
4153+
4154+
4155+
###############################################################################
4156+
# Test DATETIME_AS_STRING=YES GetArrowStream() option
4157+
4158+
4159+
def test_ogr_parquet_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
4160+
pytest.importorskip("osgeo.gdal_array")
4161+
pytest.importorskip("numpy")
4162+
4163+
with gdal.OpenEx(
4164+
"data/parquet/test.parquet", gdal.OF_VECTOR, allowed_drivers=["Parquet"]
4165+
) as ds:
4166+
lyr = ds.GetLayer(0)
4167+
stream = lyr.GetArrowStreamAsNumPy(
4168+
options=["USE_MASKED_ARRAYS=NO", "DATETIME_AS_STRING=YES"]
4169+
)
4170+
batches = [batch for batch in stream]
4171+
batch = batches[0]
4172+
assert (
4173+
batch["timestamp_ms_gmt_minus_0215"][0] == b"2019-01-01T14:00:00.500-02:15"
4174+
)

ogr/ogrsf_frmts/adbc/ogradbclayer.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,8 +155,11 @@ GDALDataset *OGRADBCLayer::GetDataset()
155155
bool OGRADBCLayer::GetArrowStream(struct ArrowArrayStream *out_stream,
156156
CSLConstList papszOptions)
157157
{
158-
if (m_poFilterGeom || m_poAttrQuery)
158+
if (m_poFilterGeom || m_poAttrQuery ||
159+
CPLFetchBool(papszOptions, "DATETIME_AS_STRING", false))
160+
{
159161
return OGRLayer::GetArrowStream(out_stream, papszOptions);
162+
}
160163

161164
if (m_stream)
162165
{

ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5490,6 +5490,23 @@ inline bool OGRArrowLayer::UseRecordBatchBaseImplementation() const
54905490
return true;
54915491
}
54925492

5493+
if (m_aosArrowArrayStreamOptions.FetchBool("DATETIME_AS_STRING", false))
5494+
{
5495+
const int nFieldCount = m_poFeatureDefn->GetFieldCount();
5496+
for (int i = 0; i < nFieldCount; ++i)
5497+
{
5498+
const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i);
5499+
if (!poFieldDefn->IsIgnored() &&
5500+
poFieldDefn->GetType() == OFTDateTime)
5501+
{
5502+
CPLDebug("ARROW",
5503+
"DATETIME_AS_STRING=YES not compatible of fast "
5504+
"Arrow implementation");
5505+
return true;
5506+
}
5507+
}
5508+
}
5509+
54935510
if (EQUAL(m_aosArrowArrayStreamOptions.FetchNameValueDef(
54945511
"GEOMETRY_ENCODING", ""),
54955512
"WKB"))

ogr/ogrsf_frmts/flatgeobuf/ogrflatgeobuflayer.cpp

Lines changed: 54 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1481,6 +1481,8 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream,
14811481
}
14821482

14831483
const GIntBig nFeatureIdxStart = m_featuresPos;
1484+
const bool bDateTimeAsString =
1485+
m_aosArrowArrayStreamOptions.FetchBool("DATETIME_AS_STRING", false);
14841486

14851487
const uint32_t nMemLimit = OGRArrowArrayHelper::GetMemLimit();
14861488
while (iFeat < sHelper.m_nMaxBatchSize)
@@ -1851,6 +1853,58 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream,
18511853
offset += sizeof(double);
18521854
break;
18531855

1856+
case ColumnType::DateTime:
1857+
{
1858+
if (!bDateTimeAsString)
1859+
{
1860+
if (offset + sizeof(uint32_t) > size)
1861+
{
1862+
CPLErrorInvalidSize("datetime length ");
1863+
goto error;
1864+
}
1865+
uint32_t len;
1866+
memcpy(&len, data + offset, sizeof(int32_t));
1867+
CPL_LSBPTR32(&len);
1868+
offset += sizeof(uint32_t);
1869+
if (len > size - offset || len > 32)
1870+
{
1871+
CPLErrorInvalidSize("datetime value");
1872+
goto error;
1873+
}
1874+
if (!isIgnored)
1875+
{
1876+
OGRField ogrField;
1877+
if (ParseDateTime(
1878+
reinterpret_cast<const char *>(data +
1879+
offset),
1880+
len, &ogrField))
1881+
{
1882+
sHelper.SetDateTime(
1883+
psArray, iFeat, brokenDown,
1884+
sHelper.m_anTZFlags[i], ogrField);
1885+
}
1886+
else
1887+
{
1888+
char str[32 + 1];
1889+
memcpy(str, data + offset, len);
1890+
str[len] = '\0';
1891+
if (OGRParseDate(str, &ogrField, 0))
1892+
{
1893+
sHelper.SetDateTime(
1894+
psArray, iFeat, brokenDown,
1895+
sHelper.m_anTZFlags[i], ogrField);
1896+
}
1897+
}
1898+
}
1899+
offset += len;
1900+
break;
1901+
}
1902+
else
1903+
{
1904+
[[fallthrough]];
1905+
}
1906+
}
1907+
18541908
case ColumnType::String:
18551909
case ColumnType::Json:
18561910
case ColumnType::Binary:
@@ -1896,50 +1950,6 @@ int OGRFlatGeobufLayer::GetNextArrowArray(struct ArrowArrayStream *stream,
18961950
offset += len;
18971951
break;
18981952
}
1899-
1900-
case ColumnType::DateTime:
1901-
{
1902-
if (offset + sizeof(uint32_t) > size)
1903-
{
1904-
CPLErrorInvalidSize("datetime length ");
1905-
goto error;
1906-
}
1907-
uint32_t len;
1908-
memcpy(&len, data + offset, sizeof(int32_t));
1909-
CPL_LSBPTR32(&len);
1910-
offset += sizeof(uint32_t);
1911-
if (len > size - offset || len > 32)
1912-
{
1913-
CPLErrorInvalidSize("datetime value");
1914-
goto error;
1915-
}
1916-
if (!isIgnored)
1917-
{
1918-
OGRField ogrField;
1919-
if (ParseDateTime(reinterpret_cast<const char *>(
1920-
data + offset),
1921-
len, &ogrField))
1922-
{
1923-
sHelper.SetDateTime(psArray, iFeat, brokenDown,
1924-
sHelper.m_anTZFlags[i],
1925-
ogrField);
1926-
}
1927-
else
1928-
{
1929-
char str[32 + 1];
1930-
memcpy(str, data + offset, len);
1931-
str[len] = '\0';
1932-
if (OGRParseDate(str, &ogrField, 0))
1933-
{
1934-
sHelper.SetDateTime(
1935-
psArray, iFeat, brokenDown,
1936-
sHelper.m_anTZFlags[i], ogrField);
1937-
}
1938-
}
1939-
}
1940-
offset += len;
1941-
break;
1942-
}
19431953
}
19441954
}
19451955
}

0 commit comments

Comments
 (0)