Merge remote-tracking branch 'origin/jacob/netcdf' into jacob/netcdf

jacobbieker · jacobbieker · commit 137ef7979205 · 2023-11-16T14:57:46.000Z
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,7 +1,7 @@
 [bumpversion]
 commit = True
 tag = True
-current_version = 2.0.9
+current_version = 2.0.11
 message = Bump version: {current_version} → {new_version} [skip ci]
 
 [bumpversion:file:setup.py]
diff --git a/.github/workflows/workflows.yaml b/.github/workflows/workflows.yaml
@@ -16,3 +16,4 @@ jobs:
       sudo_apt_install: "libgeos++-dev libproj-dev proj-data proj-bin"
       #      brew_install: "proj geos librttopo"
       os_list: '["ubuntu-latest"]'
+      python-version: "['3.9','3.10','3.11']"
diff --git a/ocf_datapipes/transform/xarray/__init__.py b/ocf_datapipes/transform/xarray/__init__.py
@@ -34,6 +34,9 @@
 from .get_contiguous_time_periods import (
     GetContiguousT0TimePeriodsIterDataPipe as GetContiguousT0TimePeriods,
 )
+from .get_contiguous_time_periods import (
+    GetContiguousT0TimePeriodsNWPIterDataPipe as GetContiguousT0TimePeriodsNWP,
+)
 from .gsp.create_gsp_image import CreateGSPImageIterDataPipe as CreateGSPImage
 from .gsp.ensure_n_gsp_per_example import (
     EnsureNGSPSPerExampleIterDataPipe as EnsureNGSPSPerExampleIter,
diff --git a/ocf_datapipes/transform/xarray/get_contiguous_time_periods.py b/ocf_datapipes/transform/xarray/get_contiguous_time_periods.py
@@ -61,21 +61,48 @@ def __iter__(self) -> pd.DataFrame:
             yield contiguous_time_periods
 
 
-def get_contiguous_t0_time_periods(
-    contiguous_time_periods: pd.DataFrame, history_duration: timedelta, forecast_duration: timedelta
-) -> pd.DataFrame:
-    """Get all time periods which contain valid t0 datetimes.
+@functional_datapipe("get_contiguous_time_periods_nwp")
+class GetContiguousT0TimePeriodsNWPIterDataPipe(IterDataPipe):
+    """Get contiguous NWP time periods for training"""
 
-    `t0` is the datetime of the most recent observation.
+    def __init__(
+        self,
+        source_datapipe: IterDataPipe,
+        history_duration: timedelta,
+        max_staleness: timedelta = timedelta(minutes=0),
+        max_dropout: timedelta = timedelta(minutes=0),
+        time_dim: str = "init_time_utc",
+    ):
+        """
+        Get contiguous time periods for use in determining t0 times for training
 
-    Returns:
-      pd.DataFrame where each row represents a single time period.  The pd.DataFrame
-      has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
-    """
-    contiguous_time_periods["start_dt"] += history_duration
-    contiguous_time_periods["end_dt"] -= forecast_duration
-    assert (contiguous_time_periods["start_dt"] < contiguous_time_periods["end_dt"]).all()
-    return contiguous_time_periods
+        Args:
+            source_datapipe: Datapipe emitting a Xarray dataset
+            history_duration: Length of the historical slice used for a sample
+            max_staleness: Up to how long after an NWP forecast init_time are we willing to use the
+                forecast. Each init time will only be used up to this t0 time regardless of the
+                forecast valid time.
+            max_dropout: The maximum amount of dropout that will be used. This must be <=
+                max_staleness.
+            time_dim: Name of the time dimension along which to find the contiguous time periods
+        """
+        self.source_datapipe = source_datapipe
+        self.history_duration = history_duration
+        self.max_staleness = max_staleness
+        self.max_dropout = max_dropout
+        self.time_dim = time_dim
+
+    def __iter__(self) -> pd.DataFrame:
+        """Calculate contiguous time periods and return a dataframe containing them"""
+        for xr_data in self.source_datapipe:
+            logger.debug("Getting contiguous NWP t0 time periods")
+            contiguous_time_periods = get_contiguous_t0_periods_nwp(
+                datetimes=pd.DatetimeIndex(xr_data[self.time_dim]),
+                history_duration=self.history_duration,
+                max_staleness=self.max_staleness,
+                max_dropout=self.max_dropout,
+            )
+            yield contiguous_time_periods
 
 
 def get_contiguous_time_periods(
@@ -132,3 +159,75 @@ def get_contiguous_time_periods(
     )
 
     return pd.DataFrame(periods)
+
+
+def get_contiguous_t0_time_periods(
+    contiguous_time_periods: pd.DataFrame, history_duration: timedelta, forecast_duration: timedelta
+) -> pd.DataFrame:
+    """Get all time periods which contain valid t0 datetimes.
+
+    `t0` is the datetime of the most recent observation.
+
+    Returns:
+      pd.DataFrame where each row represents a single time period.  The pd.DataFrame
+      has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    contiguous_time_periods["start_dt"] += history_duration
+    contiguous_time_periods["end_dt"] -= forecast_duration
+    assert (contiguous_time_periods["start_dt"] < contiguous_time_periods["end_dt"]).all()
+    return contiguous_time_periods
+
+
+def get_contiguous_t0_periods_nwp(
+    datetimes: pd.DatetimeIndex,
+    history_duration: timedelta,
+    max_staleness: timedelta,
+    max_dropout: timedelta = timedelta(0),
+) -> pd.DataFrame:
+    """Get all time periods from the NWP init times which are valid as t0 datetimes.
+
+    Args:
+        datetimes: Sorted, unique, non-empty pd.DatetimeIndex of NWP init times
+        history_duration: Length of the historical slice used for a sample
+        max_staleness: Up to how long after an NWP forecast init_time are we willing to use the
+            forecast. Each init time will only be used up to this t0 time regardless of the forecast
+            valid time.
+        max_dropout: The maximum amount of dropout that will be used. This must be <=
+            max_staleness.
+
+    Returns:
+        pd.DataFrame where each row represents a single time period.  The pd.DataFrame
+        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
+    """
+    # Sanity checks.
+    assert len(datetimes) > 0
+    assert datetimes.is_monotonic_increasing
+    assert datetimes.is_unique
+    assert history_duration >= timedelta(0)
+    assert max_staleness >= timedelta(0)
+    assert max_dropout <= max_staleness
+
+    hist_drop_buffer = max(history_duration, max_dropout)
+
+    # Store contiguous periods
+    contiguous_periods = []
+
+    # Start first period allowing for history slice and max dropout
+    start_this_period = datetimes[0] + hist_drop_buffer
+
+    # The first forecast is valid up to the max staleness
+    end_this_period = datetimes[0] + max_staleness
+
+    for dt_init in datetimes[1:]:
+        # If the previous init time becomes stale before the next init becomes valid whilst also
+        # considering dropout and the need for a historic period - then the contiguous period breaks
+        if end_this_period < dt_init + hist_drop_buffer:
+            contiguous_periods += [[start_this_period, end_this_period]]
+
+            # And start a new period
+            start_this_period = dt_init + hist_drop_buffer
+        end_this_period = dt_init + max_staleness
+
+    contiguous_periods += [[start_this_period, end_this_period]]
+
+    return pd.DataFrame(contiguous_periods, columns=["start_dt", "end_dt"])
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="ocf_datapipes",
-    version="2.0.9",
+    version="2.0.11",
     license="MIT",
     description="Pytorch Datapipes built for use in Open Climate Fix's forecasting work",
     author="Jacob Bieker, Jack Kelly, Peter Dudfield, James Fulton",
diff --git a/tests/transform/xarray/test_get_contiguous_time_periods.py b/tests/transform/xarray/test_get_contiguous_time_periods.py
@@ -1,17 +1,172 @@
 from datetime import timedelta
 
-from ocf_datapipes.select import DropGSP, LocationPicker
-from ocf_datapipes.transform.xarray import GetContiguousT0TimePeriods
+import numpy as np
+import pandas as pd
+
+from torchdata.datapipes.iter import IterableWrapper
+from ocf_datapipes.transform.xarray import GetContiguousT0TimePeriods, GetContiguousT0TimePeriodsNWP
+
+
+def _remove_indexes(x, inds):
+    xs = []
+    i_last = -1
+    for i in np.sort(inds):
+        xs += [x[i_last + 1 : i]]
+        i_last = i
+    xs += [x[i_last + 1 :]]
+    return pd.to_datetime(np.concatenate(xs))
 
 
 def test_get_contiguous_time_periods(nwp_datapipe):
-    nwp_datapipe = GetContiguousT0TimePeriods(
-        nwp_datapipe,
-        sample_period_duration=timedelta(hours=3),
-        history_duration=timedelta(minutes=60),
-        forecast_duration=timedelta(minutes=180),
-        time_dim="init_time_utc",
+    # Create 5-minutely data timestamps
+    freq = timedelta(minutes=5)
+    history_duration = timedelta(minutes=60)
+    forecast_duration = timedelta(minutes=15)
+
+    datetimes = _remove_indexes(
+        pd.date_range("2023-01-01 12:00", "2023-01-01 17:00", freq=freq),
+        [5, 30],
+    )
+
+    # Create initial datapipe
+    time_datapipe = IterableWrapper([pd.DataFrame(datetimes, columns=["time_utc"]).to_xarray()])
+
+    history_duration = timedelta(minutes=60)
+
+    contig_t0_datapipe = GetContiguousT0TimePeriods(
+        time_datapipe,
+        sample_period_duration=freq,
+        history_duration=history_duration,
+        forecast_duration=forecast_duration,
+        time_dim="time_utc",
+    )
+
+    periods = next(iter(contig_t0_datapipe))
+
+    expected_results = pd.DataFrame(
+        {
+            "start_dt": pd.to_datetime(
+                [
+                    "2023-01-01 13:30:00",
+                    "2023-01-01 15:35:00",
+                ]
+            ),
+            "end_dt": pd.to_datetime(
+                [
+                    "2023-01-01 14:10:00",
+                    "2023-01-01 16:45:00",
+                ]
+            ),
+        },
     )
 
-    batch = next(iter(nwp_datapipe))
-    print(batch)
+    assert periods.equals(expected_results)
+
+
+def test_get_contiguous_time_periods_nwp():
+    # These are the expected results of the test
+    expected_results = [
+        pd.DataFrame(
+            {
+                "start_dt": pd.to_datetime(["2023-01-01 03:00:00", "2023-01-02 03:00:00"]),
+                "end_dt": pd.to_datetime(["2023-01-01 21:00:00", "2023-01-03 06:00:00"]),
+            },
+        ),
+        pd.DataFrame(
+            {
+                "start_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 05:00:00",
+                        "2023-01-02 05:00:00",
+                        "2023-01-02 14:00:00",
+                    ]
+                ),
+                "end_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 21:00:00",
+                        "2023-01-02 12:00:00",
+                        "2023-01-03 06:00:00",
+                    ]
+                ),
+            },
+        ),
+        pd.DataFrame(
+            {
+                "start_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 05:00:00",
+                        "2023-01-01 11:00:00",
+                        "2023-01-02 05:00:00",
+                        "2023-01-02 14:00:00",
+                    ]
+                ),
+                "end_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 09:00:00",
+                        "2023-01-01 18:00:00",
+                        "2023-01-02 09:00:00",
+                        "2023-01-03 03:00:00",
+                    ]
+                ),
+            },
+        ),
+        pd.DataFrame(
+            {
+                "start_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 05:00:00",
+                        "2023-01-01 11:00:00",
+                        "2023-01-01 14:00:00",
+                        "2023-01-02 05:00:00",
+                        "2023-01-02 14:00:00",
+                        "2023-01-02 17:00:00",
+                        "2023-01-02 20:00:00",
+                        "2023-01-02 23:00:00",
+                    ]
+                ),
+                "end_dt": pd.to_datetime(
+                    [
+                        "2023-01-01 06:00:00",
+                        "2023-01-01 12:00:00",
+                        "2023-01-01 15:00:00",
+                        "2023-01-02 06:00:00",
+                        "2023-01-02 15:00:00",
+                        "2023-01-02 18:00:00",
+                        "2023-01-02 21:00:00",
+                        "2023-01-03 00:00:00",
+                    ]
+                ),
+            },
+        ),
+    ]
+
+    # Create 3-hourly init times with a few time stamps missing
+    freq = timedelta(minutes=180)
+
+    datetimes = _remove_indexes(
+        pd.date_range("2023-01-01 03:00", "2023-01-02 21:00", freq=freq),
+        [1, 4, 5, 6, 7, 9, 10],
+    )
+
+    # Choose some history durations and max stalenesses
+    history_durations_hr = [0, 2, 2, 2]
+    max_stalenesses_hr = [9, 9, 6, 3]
+
+    for i in range(len(expected_results)):
+        history_duration = timedelta(hours=history_durations_hr[i])
+        max_staleness = timedelta(hours=max_stalenesses_hr[i])
+
+        # Create initial datapipe
+        time_datapipe = IterableWrapper(
+            [pd.DataFrame(datetimes, columns=["init_time_utc"]).to_xarray()]
+        )
+
+        time_periods = time_datapipe.get_contiguous_time_periods_nwp(
+            history_duration=history_duration,
+            max_staleness=max_staleness,
+            time_dim="init_time_utc",
+        )
+
+        # Check if results are as expected
+        results = next(iter(time_periods))
+        assert results.equals(expected_results[i])