From fca80330f90452efe87885c77c91ef55b50f8056 Mon Sep 17 00:00:00 2001
From: CyrilThebault <cyril.thebault@ucalgary.ca>
Date: Tue, 29 Jul 2025 15:02:40 -0600
Subject: [PATCH] Change algorithm for summarizing values in the portion of a
 raster dataset that is covered by a polygon (rasterio -> exactextract)

---
 .../1_topo/1_find_HRU_elevation.ipynb         | 20 ++---
 4b_remapping/1_topo/1_find_HRU_elevation.py   | 13 ++-
 .../1_topo/2_find_HRU_soil_classes.ipynb      | 84 +++++++++----------
 .../1_topo/2_find_HRU_soil_classes.py         | 52 +++++++-----
 .../1_topo/3_find_HRU_land_classes.ipynb      | 77 ++++++++---------
 .../1_topo/3_find_HRU_land_classes.py         | 36 +++++---
 6 files changed, 140 insertions(+), 142 deletions(-)

diff --git a/4b_remapping/1_topo/1_find_HRU_elevation.ipynb b/4b_remapping/1_topo/1_find_HRU_elevation.ipynb
index def957c..6d8debe 100644
--- a/4b_remapping/1_topo/1_find_HRU_elevation.ipynb
+++ b/4b_remapping/1_topo/1_find_HRU_elevation.ipynb
@@ -23,10 +23,10 @@
     "# modules\n",
     "import os\n",
     "import geopandas as gpd\n",
-    "from rasterstats import zonal_stats\n",
     "from pathlib import Path\n",
     "from shutil import copyfile\n",
-    "from datetime import datetime"
+    "from datetime import datetime\n",
+    "from exactextract import exact_extract"
    ]
   },
   {
@@ -250,7 +250,8 @@
    "outputs": [],
    "source": [
     "# Load the shapefile\n",
-    "gdf = gpd.read_file(str(intersect_path/intersect_name))"
+    "gdf = gpd.read_file(str(intersect_path/intersect_name))\n",
+    "raster_path = dem_path / dem_name"
    ]
   },
   {
@@ -260,10 +261,7 @@
    "outputs": [],
    "source": [
     "# Calculate zonal statistics\n",
-    "stats = zonal_stats(gdf, \n",
-    "                    str(dem_path/dem_name), \n",
-    "                    stats=['mean'], \n",
-    "                    all_touched=True)"
+    "elev_means = exact_extract(str(raster_path), gdf, 'mean', output='pandas')"
    ]
   },
   {
@@ -273,7 +271,7 @@
    "outputs": [],
    "source": [
     "# Add the mean elevation to the GeoDataFrame\n",
-    "gdf['elev_mean'] = [stat['mean'] for stat in stats]"
+    "gdf['elev_mean'] = elev_means"
    ]
   },
   {
@@ -363,9 +361,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "summa-env",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "summa-env"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -377,7 +375,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/4b_remapping/1_topo/1_find_HRU_elevation.py b/4b_remapping/1_topo/1_find_HRU_elevation.py
index 26a6fd1..689b818 100644
--- a/4b_remapping/1_topo/1_find_HRU_elevation.py
+++ b/4b_remapping/1_topo/1_find_HRU_elevation.py
@@ -9,11 +9,10 @@
 # modules
 import os
 import geopandas as gpd
-from rasterstats import zonal_stats
 from pathlib import Path
 from shutil import copyfile
 from datetime import datetime
-
+from exactextract import exact_extract
 
 # --- Control file handling
 # Easy access to control file folder
@@ -113,16 +112,14 @@ def make_default_path(suffix):
         
 # --- Rasterstats analysis
 # Load the shapefile
-gdf = gpd.read_file(str(intersect_path/intersect_name))
+gdf = gpd.read_file(intersect_path / intersect_name)
+raster_path = dem_path / dem_name
 
 # Calculate zonal statistics
-stats = zonal_stats(gdf, 
-                    str(dem_path/dem_name), 
-                    stats=['mean'], 
-                    all_touched=True)
+elev_means = exact_extract(str(raster_path), gdf, 'mean', output='pandas')
 
 # Add the mean elevation to the GeoDataFrame
-gdf['elev_mean'] = [stat['mean'] for stat in stats]
+gdf['elev_mean'] = elev_means
 
 # Save the updated GeoDataFrame
 gdf.to_file(str(intersect_path/intersect_name))
diff --git a/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb b/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb
index 8de6fd1..3e7129e 100644
--- a/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb
+++ b/4b_remapping/1_topo/2_find_HRU_soil_classes.ipynb
@@ -19,10 +19,9 @@
     "from shutil import copyfile\n",
     "from datetime import datetime\n",
     "import geopandas as gpd\n",
-    "import rasterio\n",
-    "from rasterstats import zonal_stats\n",
     "import pandas as pd\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "from exactextract import exact_extract"
    ]
   },
   {
@@ -201,7 +200,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Rasterstats analysis"
+    "#### Exactextract analysis"
    ]
   },
   {
@@ -211,7 +210,8 @@
    "outputs": [],
    "source": [
     "# Load the shapefile\n",
-    "gdf = gpd.read_file(catchment_path / catchment_name)"
+    "gdf = gpd.read_file(catchment_path / catchment_name)\n",
+    "raster_path = soil_path / soil_name"
    ]
   },
   {
@@ -220,10 +220,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Open the raster file\n",
-    "with rasterio.open(soil_path / soil_name) as src:\n",
-    "    affine = src.transform\n",
-    "    array = src.read(1)  # Read the first band"
+    "# Perform exactextract with fractional coverage per class\n",
+    "stats = exact_extract(\n",
+    "    str(raster_path),\n",
+    "    gdf,\n",
+    "    [\"unique\", \"frac\"],\n",
+    "    output='pandas'\n",
+    ")"
    ]
   },
   {
@@ -232,53 +235,44 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Get unique values in the raster\n",
-    "unique_values = np.unique(array).astype(int)\n",
-    "unique_values = unique_values[unique_values != src.nodata]  # Remove nodata value if present"
+    "# Build a list of row-wise dicts: {soil_class: fraction}\n",
+    "rows = []\n",
+    "for i, (classes, fractions) in stats.iterrows():\n",
+    "    row_dict = {k: v for k, v in zip(classes, fractions)}\n",
+    "    rows.append(row_dict)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/cyrilthebault/Postdoc_Ucal/02_DATA/01_RAW/CWARHM-main/cwarhm-env/lib/python3.11/site-packages/rasterstats/io.py:328: NodataWarning: Setting nodata to -999; specify nodata explicitly\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Perform zonal statistics for each unique value\n",
-    "results = []\n",
-    "for value in unique_values:\n",
-    "    category_stats = zonal_stats(gdf, array, affine=affine, stats=['count'], categorical=True, \n",
-    "                                 category_map={value: 1})\n",
-    "    counts = [stat.get(1, 0) for stat in category_stats]  # Get count for category 1, default to 0 if not present\n",
-    "    results.append(pd.DataFrame(counts, columns=[f'USGS_{value}']))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Combine all results\n",
-    "hist_df = pd.concat(results, axis=1).fillna(0).astype(int)"
+    "# Convert to DataFrame\n",
+    "df_stats = pd.DataFrame(rows)\n",
+    "\n",
+    "# Replace NaN with 0, ensure float type\n",
+    "df_stats = df_stats.fillna(0).astype(float)\n",
+    "\n",
+    "# Sort columns numerically\n",
+    "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))\n",
+    "df_stats = df_stats[sorted_columns]\n",
+    "\n",
+    "# Round to 4 digit\n",
+    "df_stats = df_stats.round(4)\n",
+    "\n",
+    "# Rename columns to USGS soil class format\n",
+    "df_stats.columns = [f'USGS_{int(col)}' for col in df_stats.columns]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Combine the original GeoDataFrame with the histogram results\n",
-    "result = gdf.join(hist_df)"
+    "# Merge with original GeoDataFrame\n",
+    "result = gdf.join(df_stats)"
    ]
   },
   {
@@ -368,9 +362,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "summa-env",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "summa-env"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -382,7 +376,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/4b_remapping/1_topo/2_find_HRU_soil_classes.py b/4b_remapping/1_topo/2_find_HRU_soil_classes.py
index 2705a71..cb22ae8 100644
--- a/4b_remapping/1_topo/2_find_HRU_soil_classes.py
+++ b/4b_remapping/1_topo/2_find_HRU_soil_classes.py
@@ -7,10 +7,9 @@
 from shutil import copyfile
 from datetime import datetime
 import geopandas as gpd
-import rasterio
-from rasterstats import zonal_stats
 import pandas as pd
 import numpy as np
+from exactextract import exact_extract
 
 
 # --- Control file handling
@@ -92,32 +91,43 @@ def make_default_path(suffix):
 intersect_path.mkdir(parents=True, exist_ok=True)
 
 
-# --- Rasterstats analysis
+# --- Exactextract analysis
 # Load the shapefile
 gdf = gpd.read_file(catchment_path / catchment_name)
+raster_path = soil_path / soil_name
 
-# Open the raster file
-with rasterio.open(soil_path / soil_name) as src:
-    affine = src.transform
-    array = src.read(1)  # Read the first band
+# Perform exactextract with fractional coverage per class
+stats = exact_extract(
+    str(raster_path),
+    gdf,
+    ["unique", "frac"],
+    output='pandas'
+)
 
-# Get unique values in the raster
-unique_values = np.unique(array).astype(int)
-unique_values = unique_values[unique_values != src.nodata]  # Remove nodata value if present
+# Build a list of row-wise dicts: {soil_class: fraction}
+rows = []
+for i, (classes, fractions) in stats.iterrows():
+    row_dict = {k: v for k, v in zip(classes, fractions)}
+    rows.append(row_dict)
 
-# Perform zonal statistics for each unique value
-results = []
-for value in unique_values:
-    category_stats = zonal_stats(gdf, array, affine=affine, stats=['count'], categorical=True, 
-                                 category_map={value: 1})
-    counts = [stat.get(1, 0) for stat in category_stats]  # Get count for category 1, default to 0 if not present
-    results.append(pd.DataFrame(counts, columns=[f'USGS_{value}']))
+# Convert to DataFrame
+df_stats = pd.DataFrame(rows)
 
-# Combine all results
-hist_df = pd.concat(results, axis=1).fillna(0).astype(int)
+# Replace NaN with 0, ensure float type
+df_stats = df_stats.fillna(0).astype(float)
 
-# Combine the original GeoDataFrame with the histogram results
-result = gdf.join(hist_df)
+# Sort columns numerically
+sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))
+df_stats = df_stats[sorted_columns]
+
+# Round to 4 digit
+df_stats = df_stats.round(4)
+
+# Rename columns to USGS soil class format
+df_stats.columns = [f'USGS_{int(col)}' for col in df_stats.columns]
+
+# Merge with original GeoDataFrame
+result = gdf.join(df_stats)
 
 # Save the result
 result.to_file(intersect_path / intersect_name)
diff --git a/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb b/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb
index a73be67..7d81465 100644
--- a/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb
+++ b/4b_remapping/1_topo/3_find_HRU_land_classes.ipynb
@@ -19,9 +19,8 @@
     "from shutil import copyfile\n",
     "from datetime import datetime\n",
     "import geopandas as gpd\n",
-    "import rasterio\n",
-    "from rasterstats import zonal_stats\n",
-    "import pandas as pd"
+    "import pandas as pd\n",
+    "from exactextract import exact_extract"
    ]
   },
   {
@@ -200,7 +199,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Rasterstats analysis"
+    "#### Exactextract analysis"
    ]
   },
   {
@@ -220,10 +219,7 @@
    "outputs": [],
    "source": [
     "# Open the raster file\n",
-    "with rasterio.open(land_path / land_name) as src:\n",
-    "    affine = src.transform\n",
-    "    array = src.read(1)\n",
-    "    nodata = src.nodata"
+    "raster_path = land_path / land_name"
    ]
   },
   {
@@ -242,55 +238,48 @@
    ],
    "source": [
     "# Perform zonal statistics\n",
-    "stats = zonal_stats(gdf, array, affine=affine, nodata=nodata, categorical=True)"
+    "stats = exact_extract(\n",
+    "    str(raster_path),\n",
+    "    gdf,\n",
+    "    [\"unique\", \"frac\"],\n",
+    "    output='pandas'\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Convert stats to DataFrame\n",
-    "df_stats = pd.DataFrame(stats)"
+    "# Initialize list to store per-row dicts\n",
+    "rows = []\n",
+    "for i, (classes, fractions) in stats.iterrows():\n",
+    "    row_dict = {k: v for k, v in zip(classes, fractions)}\n",
+    "    rows.append(row_dict)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Convert list of dicts to DataFrame\n",
+    "df_stats = pd.DataFrame(rows)\n",
+    "\n",
     "# Convert column names to integers and sort them\n",
-    "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))\n",
+    "\n",
     "# Reorder DataFrame based on sorted column names\n",
-    "df_stats = df_stats[sorted_columns]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Replace NaN with 0 and convert to integers\n",
-    "df_stats = df_stats.fillna(0).astype(int)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "df_stats = df_stats[sorted_columns]\n",
+    "\n",
+    "# Replace NaN with 0 and convert to float\n",
+    "df_stats = df_stats.fillna(0).astype(float)\n",
+    "\n",
+    "# Round to 4 digit\n",
+    "df_stats = df_stats.round(4)\n",
+    "\n",
     "# Rename columns\n",
     "df_stats.columns = [f'IGBP_{int(col)}' for col in df_stats.columns]"
    ]
@@ -392,9 +381,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "summa-env",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "summa-env"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -406,7 +395,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/4b_remapping/1_topo/3_find_HRU_land_classes.py b/4b_remapping/1_topo/3_find_HRU_land_classes.py
index 94e0220..982f8d2 100644
--- a/4b_remapping/1_topo/3_find_HRU_land_classes.py
+++ b/4b_remapping/1_topo/3_find_HRU_land_classes.py
@@ -7,9 +7,8 @@
 from shutil import copyfile
 from datetime import datetime
 import geopandas as gpd
-import rasterio
-from rasterstats import zonal_stats
 import pandas as pd
+from exactextract import exact_extract
 
 
 # --- Control file handling
@@ -91,21 +90,29 @@ def make_default_path(suffix):
 intersect_path.mkdir(parents=True, exist_ok=True)
 
 
-# --- Rasterstats analysis
+# --- Exactextract analysis
 # Load the shapefile
 gdf = gpd.read_file(catchment_path / catchment_name)
+raster_path = land_path / land_name
 
-# Open the raster file
-with rasterio.open(land_path / land_name) as src:
-    affine = src.transform
-    array = src.read(1)
-    nodata = src.nodata
 
 # Perform zonal statistics
-stats = zonal_stats(gdf, array, affine=affine, nodata=nodata, categorical=True)
+stats = exact_extract(
+    str(raster_path),
+    gdf,
+    ["unique", "frac"],
+    output='pandas'
+)
 
-# Convert stats to DataFrame
-df_stats = pd.DataFrame(stats)
+
+# Initialize list to store per-row dicts
+rows = []
+for i, (classes, fractions) in stats.iterrows():
+    row_dict = {k: v for k, v in zip(classes, fractions)}
+    rows.append(row_dict)
+
+# Convert list of dicts to DataFrame
+df_stats = pd.DataFrame(rows)
 
 # Convert column names to integers and sort them
 sorted_columns = sorted(df_stats.columns, key=lambda x: int(x))
@@ -113,8 +120,11 @@ def make_default_path(suffix):
 # Reorder DataFrame based on sorted column names
 df_stats = df_stats[sorted_columns]
 
-# Replace NaN with 0 and convert to integers
-df_stats = df_stats.fillna(0).astype(int)
+# Replace NaN with 0 and convert to float
+df_stats = df_stats.fillna(0).astype(float)
+
+# Round to 4 digit
+df_stats = df_stats.round(4)
 
 # Rename columns
 df_stats.columns = [f'IGBP_{int(col)}' for col in df_stats.columns]