Skip to content

Commit c2842ed

Browse files
committed
Issue #114/#200 support promoting feature properties to cube values
- flattening: move options to export phase instead of vector cube constructor - introduce `VectorCube.from_geodataframe` with support for promoting selected columns to cube values - regardless of promotion: all properties are still associated with `VectorCube.geometries` for now (otherwise properties cannot be preserved when using `aggregate_spatial`, see Open-EO/openeo-api#504) - only promote numerical values by default for now
1 parent 4f97c75 commit c2842ed

File tree

4 files changed

+193
-70
lines changed

4 files changed

+193
-70
lines changed

openeo_driver/datacube.py

+98-22
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import io
88

99
import geopandas as gpd
10-
import numpy as np
10+
import numpy
1111
import pyproj
1212
import shapely.geometry
1313
import shapely.geometry.base
@@ -218,12 +218,13 @@ class DriverVectorCube:
218218
DIM_GEOMETRIES = "geometries"
219219
DIM_BANDS = "bands"
220220
FLATTEN_PREFIX = "vc"
221+
COLUMN_SELECTION_ALL = "all"
222+
COLUMN_SELECTION_NUMERICAL = "numerical"
221223

222224
def __init__(
223225
self,
224226
geometries: gpd.GeoDataFrame,
225227
cube: Optional[xarray.DataArray] = None,
226-
flatten_prefix: str = FLATTEN_PREFIX,
227228
):
228229
"""
229230
@@ -237,18 +238,77 @@ def __init__(
237238
log.error(f"First cube dim should be {self.DIM_GEOMETRIES!r} but got dims {cube.dims!r}")
238239
raise VectorCubeError("Cube's first dimension is invalid.")
239240
if not geometries.index.equals(cube.indexes[cube.dims[0]]):
240-
log.error(f"Invalid VectorCube components {geometries.index!r} != {cube.indexes[cube.dims[0]]!r}")
241+
log.error(f"Invalid VectorCube components {geometries.index=} != {cube.indexes[cube.dims[0]]=}")
241242
raise VectorCubeError("Incompatible vector cube components")
242243
self._geometries: gpd.GeoDataFrame = geometries
243244
self._cube = cube
244-
self._flatten_prefix = flatten_prefix
245245

246-
def with_cube(self, cube: xarray.DataArray, flatten_prefix: str = FLATTEN_PREFIX) -> "DriverVectorCube":
246+
def with_cube(self, cube: xarray.DataArray) -> "DriverVectorCube":
247247
"""Create new vector cube with same geometries but new cube"""
248248
log.info(f"Creating vector cube with new cube {cube.name!r}")
249-
return type(self)(
250-
geometries=self._geometries, cube=cube, flatten_prefix=flatten_prefix
251-
)
249+
return type(self)(geometries=self._geometries, cube=cube)
250+
251+
@classmethod
252+
def from_geodataframe(
253+
cls,
254+
data: gpd.GeoDataFrame,
255+
*,
256+
columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
257+
dimension_name: str = DIM_BANDS,
258+
) -> "DriverVectorCube":
259+
"""
260+
Build a DriverVectorCube from given GeoPandas data frame,
261+
using the data frame geometries as vector cube geometries
262+
and other columns (as specified) as cube values along a "bands" dimension
263+
264+
:param data: geopandas data frame
265+
:param columns_for_cube: which data frame columns to use as cube values.
266+
One of:
267+
- "numerical": automatically pick numerical columns
268+
- "all": use all columns as cube values
269+
- list of column names
270+
:param dimension_name: name of the "bands" dimension
271+
:return: vector cube
272+
"""
273+
available_columns = [c for c in data.columns if c != "geometry"]
274+
275+
if columns_for_cube is None:
276+
# TODO #114: what should default selection be?
277+
columns_for_cube = cls.COLUMN_SELECTION_NUMERICAL
278+
279+
if columns_for_cube == cls.COLUMN_SELECTION_NUMERICAL:
280+
columns_for_cube = [c for c in available_columns if numpy.issubdtype(data[c].dtype, numpy.number)]
281+
elif columns_for_cube == cls.COLUMN_SELECTION_ALL:
282+
columns_for_cube = available_columns
283+
elif isinstance(columns_for_cube, list):
284+
# TODO #114 limit to subset with available columns (and automatically fill in missing columns with nodata)?
285+
columns_for_cube = columns_for_cube
286+
else:
287+
raise ValueError(columns_for_cube)
288+
assert isinstance(columns_for_cube, list)
289+
290+
if columns_for_cube:
291+
cube_df = data[columns_for_cube]
292+
# TODO: remove `columns_for_cube` from geopandas data frame?
293+
# Enabling that triggers failure of some existing tests that use `aggregate_spatial`
294+
# to "enrich" a vector cube with pre-existing properties
295+
# Also see https://github.com/Open-EO/openeo-api/issues/504
296+
# geometries_df = data.drop(columns=columns_for_cube)
297+
geometries_df = data
298+
299+
# TODO: leverage pandas `to_xarray` and xarray `to_array` instead of this manual building?
300+
cube: xarray.DataArray = xarray.DataArray(
301+
data=cube_df.values,
302+
dims=[cls.DIM_GEOMETRIES, dimension_name],
303+
coords={
304+
cls.DIM_GEOMETRIES: data.geometry.index.to_list(),
305+
dimension_name: cube_df.columns,
306+
},
307+
)
308+
return cls(geometries=geometries_df, cube=cube)
309+
310+
else:
311+
return cls(geometries=data)
252312

253313
@classmethod
254314
def from_fiona(
@@ -261,15 +321,21 @@ def from_fiona(
261321
if len(paths) != 1:
262322
# TODO #114 EP-3981: support multiple paths
263323
raise FeatureUnsupportedException(message="Loading a vector cube from multiple files is not supported")
324+
columns_for_cube = options.get("columns_for_cube", cls.COLUMN_SELECTION_NUMERICAL)
264325
# TODO #114 EP-3981: lazy loading like/with DelayedVector
265326
# note for GeoJSON: will consider Feature.id as well as Feature.properties.id
266327
if "parquet" == driver:
267-
return cls.from_parquet(paths=paths)
328+
return cls.from_parquet(paths=paths, columns_for_cube=columns_for_cube)
268329
else:
269-
return cls(geometries=gpd.read_file(paths[0], driver=driver))
330+
gdf = gpd.read_file(paths[0], driver=driver)
331+
return cls.from_geodataframe(gdf, columns_for_cube=columns_for_cube)
270332

271333
@classmethod
272-
def from_parquet(cls, paths: List[Union[str, Path]]):
334+
def from_parquet(
335+
cls,
336+
paths: List[Union[str, Path]],
337+
columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
338+
):
273339
if len(paths) != 1:
274340
# TODO #114 EP-3981: support multiple paths
275341
raise FeatureUnsupportedException(
@@ -287,10 +353,14 @@ def from_parquet(cls, paths: List[Union[str, Path]]):
287353
if "OGC:CRS84" in str(df.crs) or "WGS 84 (CRS84)" in str(df.crs):
288354
# workaround for not being able to decode ogc:crs84
289355
df.crs = CRS.from_epsg(4326)
290-
return cls(geometries=df)
356+
return cls.from_geodataframe(df, columns_for_cube=columns_for_cube)
291357

292358
@classmethod
293-
def from_geojson(cls, geojson: dict) -> "DriverVectorCube":
359+
def from_geojson(
360+
cls,
361+
geojson: dict,
362+
columns_for_cube: Union[List[str], str] = COLUMN_SELECTION_NUMERICAL,
363+
) -> "DriverVectorCube":
294364
"""Construct vector cube from GeoJson dict structure"""
295365
validate_geojson_coordinates(geojson)
296366
# TODO support more geojson types?
@@ -308,7 +378,8 @@ def from_geojson(cls, geojson: dict) -> "DriverVectorCube":
308378
raise FeatureUnsupportedException(
309379
f"Can not construct DriverVectorCube from {geojson.get('type', type(geojson))!r}"
310380
)
311-
return cls(geometries=gpd.GeoDataFrame.from_features(features))
381+
gdf = gpd.GeoDataFrame.from_features(features)
382+
return cls.from_geodataframe(gdf, columns_for_cube=columns_for_cube)
312383

313384
@classmethod
314385
def from_geometry(
@@ -323,7 +394,9 @@ def from_geometry(
323394
geometry = [geometry]
324395
return cls(geometries=gpd.GeoDataFrame(geometry=geometry))
325396

326-
def _as_geopandas_df(self) -> gpd.GeoDataFrame:
397+
def _as_geopandas_df(
398+
self, flatten_prefix: Optional[str] = None, flatten_name_joiner: str = "~"
399+
) -> gpd.GeoDataFrame:
327400
"""Join geometries and cube as a geopandas dataframe"""
328401
# TODO: avoid copy?
329402
df = self._geometries.copy(deep=True)
@@ -334,18 +407,19 @@ def _as_geopandas_df(self) -> gpd.GeoDataFrame:
334407
if self._cube.dims[1:]:
335408
stacked = self._cube.stack(prop=self._cube.dims[1:])
336409
log.info(f"Flattened cube component of vector cube to {stacked.shape[1]} properties")
410+
name_prefix = [flatten_prefix] if flatten_prefix else []
337411
for p in stacked.indexes["prop"]:
338-
name = "~".join(str(x) for x in [self._flatten_prefix] + list(p))
412+
name = flatten_name_joiner.join(str(x) for x in name_prefix + list(p))
339413
# TODO: avoid column collisions?
340414
df[name] = stacked.sel(prop=p)
341415
else:
342-
df[self._flatten_prefix] = self._cube
416+
df[flatten_prefix or self.FLATTEN_PREFIX] = self._cube
343417

344418
return df
345419

346-
def to_geojson(self) -> dict:
420+
def to_geojson(self, flatten_prefix: Optional[str] = None) -> dict:
347421
"""Export as GeoJSON FeatureCollection."""
348-
return shapely.geometry.mapping(self._as_geopandas_df())
422+
return shapely.geometry.mapping(self._as_geopandas_df(flatten_prefix=flatten_prefix))
349423

350424
def to_wkt(self) -> List[str]:
351425
wkts = [str(g) for g in self._geometries.geometry]
@@ -369,7 +443,8 @@ def write_assets(
369443
)
370444
return self.to_legacy_save_result().write_assets(directory)
371445

372-
self._as_geopandas_df().to_file(path, driver=format_info.fiona_driver)
446+
gdf = self._as_geopandas_df(flatten_prefix=options.get("flatten_prefix"))
447+
gdf.to_file(path, driver=format_info.fiona_driver)
373448

374449
if not format_info.multi_file:
375450
# single file format
@@ -474,8 +549,9 @@ def get_xarray_cube_basics(self) -> Tuple[tuple, dict]:
474549
return dims, coords
475550

476551
def __eq__(self, other):
477-
return (isinstance(other, DriverVectorCube)
478-
and np.array_equal(self._as_geopandas_df().values, other._as_geopandas_df().values))
552+
return isinstance(other, DriverVectorCube) and numpy.array_equal(
553+
self._as_geopandas_df().values, other._as_geopandas_df().values
554+
)
479555

480556
def fit_class_random_forest(
481557
self,

openeo_driver/dummy/dummy_backend.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ def assert_polygon_sequence(geometries: Union[Sequence, BaseMultipartGeometry])
265265
coords=coords,
266266
name="aggregate_spatial",
267267
)
268-
return geometries.with_cube(cube=cube, flatten_prefix="agg")
268+
return geometries.with_cube(cube=cube)
269269
elif isinstance(geometries, str):
270270
geometries = [geometry for geometry in DelayedVector(geometries).geometries]
271271
n_geometries = assert_polygon_sequence(geometries)

tests/test_vectorcube.py

+65-32
Original file line numberDiff line numberDiff line change
@@ -92,37 +92,70 @@ def test_with_cube_to_geojson(self, gdf):
9292
dims += ("bands",)
9393
coords["bands"] = ["red", "green"]
9494
cube = xarray.DataArray(data=[[1, 2], [3, 4]], dims=dims, coords=coords)
95-
vc2 = vc1.with_cube(cube, flatten_prefix="bandz")
96-
assert vc1.to_geojson() == DictSubSet({
97-
"type": "FeatureCollection",
98-
"features": [
99-
DictSubSet({
100-
"type": "Feature",
101-
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
102-
"properties": {"id": "first", "pop": 1234},
103-
}),
104-
DictSubSet({
105-
"type": "Feature",
106-
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
107-
"properties": {"id": "second", "pop": 5678},
108-
}),
109-
]
110-
})
111-
assert vc2.to_geojson() == DictSubSet({
112-
"type": "FeatureCollection",
113-
"features": [
114-
DictSubSet({
115-
"type": "Feature",
116-
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
117-
"properties": {"id": "first", "pop": 1234, "bandz~red": 1, "bandz~green": 2},
118-
}),
119-
DictSubSet({
120-
"type": "Feature",
121-
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
122-
"properties": {"id": "second", "pop": 5678, "bandz~red": 3, "bandz~green": 4},
123-
}),
124-
]
125-
})
95+
vc2 = vc1.with_cube(cube)
96+
assert vc1.to_geojson() == DictSubSet(
97+
{
98+
"type": "FeatureCollection",
99+
"features": [
100+
DictSubSet(
101+
{
102+
"type": "Feature",
103+
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
104+
"properties": {"id": "first", "pop": 1234},
105+
}
106+
),
107+
DictSubSet(
108+
{
109+
"type": "Feature",
110+
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
111+
"properties": {"id": "second", "pop": 5678},
112+
}
113+
),
114+
],
115+
}
116+
)
117+
assert vc2.to_geojson() == DictSubSet(
118+
{
119+
"type": "FeatureCollection",
120+
"features": [
121+
DictSubSet(
122+
{
123+
"type": "Feature",
124+
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
125+
"properties": {"id": "first", "pop": 1234, "red": 1, "green": 2},
126+
}
127+
),
128+
DictSubSet(
129+
{
130+
"type": "Feature",
131+
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
132+
"properties": {"id": "second", "pop": 5678, "red": 3, "green": 4},
133+
}
134+
),
135+
],
136+
}
137+
)
138+
assert vc2.to_geojson(flatten_prefix="bandz") == DictSubSet(
139+
{
140+
"type": "FeatureCollection",
141+
"features": [
142+
DictSubSet(
143+
{
144+
"type": "Feature",
145+
"geometry": {"type": "Polygon", "coordinates": (((1, 1), (3, 1), (2, 3), (1, 1)),)},
146+
"properties": {"id": "first", "pop": 1234, "bandz~red": 1, "bandz~green": 2},
147+
}
148+
),
149+
DictSubSet(
150+
{
151+
"type": "Feature",
152+
"geometry": {"type": "Polygon", "coordinates": (((4, 2), (5, 4), (3, 4), (4, 2)),)},
153+
"properties": {"id": "second", "pop": 5678, "bandz~red": 3, "bandz~green": 4},
154+
}
155+
),
156+
],
157+
}
158+
)
126159

127160
@pytest.mark.parametrize(["geojson", "expected"], [
128161
(
@@ -342,7 +375,7 @@ def test_from_geometry(self, geometry, expected):
342375
],
343376
)
344377
def test_from_fiona(self, path, driver):
345-
vc = DriverVectorCube.from_fiona([path], driver=driver)
378+
vc = DriverVectorCube.from_fiona([path], driver=driver, columns_for_cube=[])
346379
assert vc.to_geojson() == DictSubSet(
347380
{
348381
"type": "FeatureCollection",

0 commit comments

Comments
 (0)