From fca80330f90452efe87885c77c91ef55b50f8056 Mon Sep 17 00:00:00 2001 From: CyrilThebault Date: Tue, 29 Jul 2025 15:02:40 -0600 Subject: [PATCH] Change algorithm for summarizing values in the portion of a raster dataset that is covered by a polygon (rasterio -> exactextract) --- .../1_topo/1_find_HRU_elevation.ipynb | 20 ++--- 4b_remapping/1_topo/1_find_HRU_elevation.py | 13 ++- .../1_topo/2_find_HRU_soil_classes.ipynb | 84 +++++++++---------- .../1_topo/2_find_HRU_soil_classes.py | 52 +++++++----- .../1_topo/3_find_HRU_land_classes.ipynb | 77 ++++++++--------- .../1_topo/3_find_HRU_land_classes.py | 36 +++++--- 6 files changed, 140 insertions(+), 142 deletions(-) diff --git a/4b_remapping/1_topo/1_find_HRU_elevation.ipynb b/4b_remapping/1_topo/1_find_HRU_elevation.ipynb index def957c..6d8debe 100644 --- a/4b_remapping/1_topo/1_find_HRU_elevation.ipynb +++ b/4b_remapping/1_topo/1_find_HRU_elevation.ipynb @@ -23,10 +23,10 @@ "# modules\n", "import os\n", "import geopandas as gpd\n", - "from rasterstats import zonal_stats\n", "from pathlib import Path\n", "from shutil import copyfile\n", - "from datetime import datetime" + "from datetime import datetime\n", + "from exactextract import exact_extract" ] }, { @@ -250,7 +250,8 @@ "outputs": [], "source": [ "# Load the shapefile\n", - "gdf = gpd.read_file(str(intersect_path/intersect_name))" + "gdf = gpd.read_file(str(intersect_path/intersect_name))\n", + "raster_path = dem_path / dem_name" ] }, { @@ -260,10 +261,7 @@ "outputs": [], "source": [ "# Calculate zonal statistics\n", - "stats = zonal_stats(gdf, \n", - " str(dem_path/dem_name), \n", - " stats=['mean'], \n", - " all_touched=True)" + "elev_means = exact_extract(str(raster_path), gdf, 'mean', output='pandas')" ] }, { @@ -273,7 +271,7 @@ "outputs": [], "source": [ "# Add the mean elevation to the GeoDataFrame\n", - "gdf['elev_mean'] = [stat['mean'] for stat in stats]" + "gdf['elev_mean'] = elev_means" ] }, { @@ -363,9 +361,9 @@ ], "metadata": { "kernelspec": { - "display_name": "summa-env", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "summa-env" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -377,7 +375,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/4b_remapping/1_topo/1_find_HRU_elevation.py b/4b_remapping/1_topo/1_find_HRU_elevation.py index 26a6fd1..689b818 100644 --- a/4b_remapping/1_topo/1_find_HRU_elevation.py +++ b/4b_remapping/1_topo/1_find_HRU_elevation.py @@ -9,11 +9,10 @@ # modules import os import geopandas as gpd -from rasterstats import zonal_stats from pathlib import Path from shutil import copyfile from datetime import datetime - +from exactextract import exact_extract # --- Control file handling # Easy access to control file folder @@ -113,16 +112,14 @@ def make_default_path(suffix): # --- Rasterstats analysis # Load the shapefile -gdf = gpd.read_file(str(intersect_path/intersect_name)) +gdf = gpd.read_file(intersect_path / intersect_name) +raster_path = dem_path / dem_name # Calculate zonal statistics -stats = zonal_stats(gdf, - str(dem_path/dem_name), - stats=['mean'], - all_touched=True) +elev_means = exact_extract(str(raster_path), gdf, 'mean', output='pandas') # Add the mean elevation to the GeoDataFrame -gdf['elev_mean'] = [stat['mean'] for stat in stats] +gdf['elev_mean'] = elev_means # Save the updated GeoDataFrame gdf.to_file(str(intersect_path/intersect_name)) diff --git a/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb b/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb index 8de6fd1..3e7129e 100644 --- a/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb +++ b/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb @@ -19,10 +19,9 @@ "from shutil import copyfile\n", "from datetime import datetime\n", "import geopandas as gpd\n", - "import rasterio\n", - "from rasterstats import zonal_stats\n", "import pandas as pd\n", - "import numpy as np" + "import numpy as np\n", + "from exactextract import exact_extract" ] }, { @@ -201,7 +200,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Rasterstats analysis" + "#### Exactextract analysis" ] }, { @@ -211,7 +210,8 @@ "outputs": [], "source": [ "# Load the shapefile\n", - "gdf = gpd.read_file(catchment_path / catchment_name)" + "gdf = gpd.read_file(catchment_path / catchment_name)\n", + "raster_path = soil_path / soil_name" ] }, { @@ -220,10 +220,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Open the raster file\n", - "with rasterio.open(soil_path / soil_name) as src:\n", - " affine = src.transform\n", - " array = src.read(1) # Read the first band" + "# Perform exactextract with fractional coverage per class\n", + "stats = exact_extract(\n", + " str(raster_path),\n", + " gdf,\n", + " [\"unique\", \"frac\"],\n", + " output='pandas'\n", + ")" ] }, { @@ -232,53 +235,44 @@ "metadata": {}, "outputs": [], "source": [ - "# Get unique values in the raster\n", - "unique_values = np.unique(array).astype(int)\n", - "unique_values = unique_values[unique_values != src.nodata] # Remove nodata value if present" + "# Build a list of row-wise dicts: {soil_class: fraction}\n", + "rows = []\n", + "for i, (classes, fractions) in stats.iterrows():\n", + " row_dict = {k: v for k, v in zip(classes, fractions)}\n", + " rows.append(row_dict)" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/cyrilthebault/Postdoc_Ucal/02_DATA/01_RAW/CWARHM-main/cwarhm-env/lib/python3.11/site-packages/rasterstats/io.py:328: NodataWarning: Setting nodata to -999; specify nodata explicitly\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "# Perform zonal statistics for each unique value\n", - "results = []\n", - "for value in unique_values:\n", - " category_stats = zonal_stats(gdf, array, affine=affine, stats=['count'], categorical=True, \n", - " category_map={value: 1})\n", - " counts = [stat.get(1, 0) for stat in category_stats] # Get count for category 1, default to 0 if not present\n", - " results.append(pd.DataFrame(counts, columns=[f'USGS_{value}']))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Combine all results\n", - "hist_df = pd.concat(results, axis=1).fillna(0).astype(int)" + "# Convert to DataFrame\n", + "df_stats = pd.DataFrame(rows)\n", + "\n", + "# Replace NaN with 0, ensure float type\n", + "df_stats = df_stats.fillna(0).astype(float)\n", + "\n", + "# Sort columns numerically\n", + "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))\n", + "df_stats = df_stats[sorted_columns]\n", + "\n", + "# Round to 4 digit\n", + "df_stats = df_stats.round(4)\n", + "\n", + "# Rename columns to USGS soil class format\n", + "df_stats.columns = [f'USGS_{int(col)}' for col in df_stats.columns]" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Combine the original GeoDataFrame with the histogram results\n", - "result = gdf.join(hist_df)" + "# Merge with original GeoDataFrame\n", + "result = gdf.join(df_stats)" ] }, { @@ -368,9 +362,9 @@ ], "metadata": { "kernelspec": { - "display_name": "summa-env", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "summa-env" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -382,7 +376,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/4b_remapping/1_topo/2_find_HRU_soil_classes.py b/4b_remapping/1_topo/2_find_HRU_soil_classes.py index 2705a71..cb22ae8 100644 --- a/4b_remapping/1_topo/2_find_HRU_soil_classes.py +++ b/4b_remapping/1_topo/2_find_HRU_soil_classes.py @@ -7,10 +7,9 @@ from shutil import copyfile from datetime import datetime import geopandas as gpd -import rasterio -from rasterstats import zonal_stats import pandas as pd import numpy as np +from exactextract import exact_extract # --- Control file handling @@ -92,32 +91,43 @@ def make_default_path(suffix): intersect_path.mkdir(parents=True, exist_ok=True) -# --- Rasterstats analysis +# --- Exactextract analysis # Load the shapefile gdf = gpd.read_file(catchment_path / catchment_name) +raster_path = soil_path / soil_name -# Open the raster file -with rasterio.open(soil_path / soil_name) as src: - affine = src.transform - array = src.read(1) # Read the first band +# Perform exactextract with fractional coverage per class +stats = exact_extract( + str(raster_path), + gdf, + ["unique", "frac"], + output='pandas' +) -# Get unique values in the raster -unique_values = np.unique(array).astype(int) -unique_values = unique_values[unique_values != src.nodata] # Remove nodata value if present +# Build a list of row-wise dicts: {soil_class: fraction} +rows = [] +for i, (classes, fractions) in stats.iterrows(): + row_dict = {k: v for k, v in zip(classes, fractions)} + rows.append(row_dict) -# Perform zonal statistics for each unique value -results = [] -for value in unique_values: - category_stats = zonal_stats(gdf, array, affine=affine, stats=['count'], categorical=True, - category_map={value: 1}) - counts = [stat.get(1, 0) for stat in category_stats] # Get count for category 1, default to 0 if not present - results.append(pd.DataFrame(counts, columns=[f'USGS_{value}'])) +# Convert to DataFrame +df_stats = pd.DataFrame(rows) -# Combine all results -hist_df = pd.concat(results, axis=1).fillna(0).astype(int) +# Replace NaN with 0, ensure float type +df_stats = df_stats.fillna(0).astype(float) -# Combine the original GeoDataFrame with the histogram results -result = gdf.join(hist_df) +# Sort columns numerically +sorted_columns = sorted(df_stats.columns, key=lambda x: int(x)) +df_stats = df_stats[sorted_columns] + +# Round to 4 digit +df_stats = df_stats.round(4) + +# Rename columns to USGS soil class format +df_stats.columns = [f'USGS_{int(col)}' for col in df_stats.columns] + +# Merge with original GeoDataFrame +result = gdf.join(df_stats) # Save the result result.to_file(intersect_path / intersect_name) diff --git a/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb b/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb index a73be67..7d81465 100644 --- a/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb +++ b/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb @@ -19,9 +19,8 @@ "from shutil import copyfile\n", "from datetime import datetime\n", "import geopandas as gpd\n", - "import rasterio\n", - "from rasterstats import zonal_stats\n", - "import pandas as pd" + "import pandas as pd\n", + "from exactextract import exact_extract" ] }, { @@ -200,7 +199,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Rasterstats analysis" + "#### Exactextract analysis" ] }, { @@ -220,10 +219,7 @@ "outputs": [], "source": [ "# Open the raster file\n", - "with rasterio.open(land_path / land_name) as src:\n", - " affine = src.transform\n", - " array = src.read(1)\n", - " nodata = src.nodata" + "raster_path = land_path / land_name" ] }, { @@ -242,55 +238,48 @@ ], "source": [ "# Perform zonal statistics\n", - "stats = zonal_stats(gdf, array, affine=affine, nodata=nodata, categorical=True)" + "stats = exact_extract(\n", + " str(raster_path),\n", + " gdf,\n", + " [\"unique\", \"frac\"],\n", + " output='pandas'\n", + ")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# Convert stats to DataFrame\n", - "df_stats = pd.DataFrame(stats)" + "# Initialize list to store per-row dicts\n", + "rows = []\n", + "for i, (classes, fractions) in stats.iterrows():\n", + " row_dict = {k: v for k, v in zip(classes, fractions)}\n", + " rows.append(row_dict)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ + "# Convert list of dicts to DataFrame\n", + "df_stats = pd.DataFrame(rows)\n", + "\n", "# Convert column names to integers and sort them\n", - "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ + "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))\n", + "\n", "# Reorder DataFrame based on sorted column names\n", - "df_stats = df_stats[sorted_columns]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "# Replace NaN with 0 and convert to integers\n", - "df_stats = df_stats.fillna(0).astype(int)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ + "df_stats = df_stats[sorted_columns]\n", + "\n", + "# Replace NaN with 0 and convert to float\n", + "df_stats = df_stats.fillna(0).astype(float)\n", + "\n", + "# Round to 4 digit\n", + "df_stats = df_stats.round(4)\n", + "\n", "# Rename columns\n", "df_stats.columns = [f'IGBP_{int(col)}' for col in df_stats.columns]" ] @@ -392,9 +381,9 @@ ], "metadata": { "kernelspec": { - "display_name": "summa-env", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "summa-env" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -406,7 +395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/4b_remapping/1_topo/3_find_HRU_land_classes.py b/4b_remapping/1_topo/3_find_HRU_land_classes.py index 94e0220..982f8d2 100644 --- a/4b_remapping/1_topo/3_find_HRU_land_classes.py +++ b/4b_remapping/1_topo/3_find_HRU_land_classes.py @@ -7,9 +7,8 @@ from shutil import copyfile from datetime import datetime import geopandas as gpd -import rasterio -from rasterstats import zonal_stats import pandas as pd +from exactextract import exact_extract # --- Control file handling @@ -91,21 +90,29 @@ def make_default_path(suffix): intersect_path.mkdir(parents=True, exist_ok=True) -# --- Rasterstats analysis +# --- Exactextract analysis # Load the shapefile gdf = gpd.read_file(catchment_path / catchment_name) +raster_path = land_path / land_name -# Open the raster file -with rasterio.open(land_path / land_name) as src: - affine = src.transform - array = src.read(1) - nodata = src.nodata # Perform zonal statistics -stats = zonal_stats(gdf, array, affine=affine, nodata=nodata, categorical=True) +stats = exact_extract( + str(raster_path), + gdf, + ["unique", "frac"], + output='pandas' +) -# Convert stats to DataFrame -df_stats = pd.DataFrame(stats) + +# Initialize list to store per-row dicts +rows = [] +for i, (classes, fractions) in stats.iterrows(): + row_dict = {k: v for k, v in zip(classes, fractions)} + rows.append(row_dict) + +# Convert list of dicts to DataFrame +df_stats = pd.DataFrame(rows) # Convert column names to integers and sort them sorted_columns = sorted(df_stats.columns, key=lambda x: int(x)) @@ -113,8 +120,11 @@ def make_default_path(suffix): # Reorder DataFrame based on sorted column names df_stats = df_stats[sorted_columns] -# Replace NaN with 0 and convert to integers -df_stats = df_stats.fillna(0).astype(int) +# Replace NaN with 0 and convert to float +df_stats = df_stats.fillna(0).astype(float) + +# Round to 4 digit +df_stats = df_stats.round(4) # Rename columns df_stats.columns = [f'IGBP_{int(col)}' for col in df_stats.columns]