HIT L1B - Handle data gaps in sectored counts (#1827)

vmartinez-cu · web-flow · commit 08892de817ec · 2025-06-18T12:23:47.000-06:00
* Fix issue where data gaps weren't being excluded when subsetting sectored counts data. Filtering the data with a single slice included any partial major frames that fell within the slice range. The code now selects complete major frames only.

* Update the test for filtering sectored counts down to complete sets so that it checks the different types of data gaps that could occur. Also update the function's docstring to clarify that only data for which livetime values from the previous 10 minutes are available is included in the output, since those values are needed to calculate rates.

* Raise error if no valid start indices are found in the dataset. Update unit test for this case.

* Address PR comments by adding comments to explain data filtering steps
diff --git a/imap_processing/hit/l1b/hit_l1b.py b/imap_processing/hit/l1b/hit_l1b.py
@@ -366,7 +366,8 @@ def subset_data_for_sectored_counts(
     A set of sectored data starts with hydrogen and ends with iron and correspond to
     the mod 10 values 0-9. The livetime values from the previous 10 minutes are used
     to calculate the rates for each set since those counts are transmitted 10 minutes
-    after they were collected.
+    after they were collected. Therefore, only complete sets of sectored counts where
+    livetime from the previous 10 minutes are available are included in the output.
 
     Parameters
     ----------
@@ -378,7 +379,7 @@ def subset_data_for_sectored_counts(
     Returns
     -------
     tuple[xr.Dataset, xr.DataArray]
-        Subsetted L1A counts dataset and corresponding livetime values.
+        Dataset of complete sectored counts and corresponding livetime values.
     """
     # Identify 10-minute intervals of complete sectored counts.
     bin_size = 10
@@ -392,16 +393,34 @@ def subset_data_for_sectored_counts(
     start_indices = np.where(matches)[0]
 
     # Filter out start indices that are less than or equal to the bin size
-    # since the previous 10 minutes are needed
-    start_indices = start_indices[start_indices > bin_size]
-    data_slice = slice(start_indices[0], start_indices[-1] + bin_size)
-
-    # Subset data to include only complete sets of sectored counts
-    l1b_sectored_rates_dataset = l1a_counts_dataset.isel(epoch=data_slice)
+    # since the previous 10 minutes are needed for calculating rates
+    if start_indices.size == 0:
+        logger.error(
+            "No data to process - valid start indices not found for "
+            "complete sectored counts."
+        )
+        raise ValueError("No valid start indices found for complete sectored counts.")
+    else:
+        start_indices = start_indices[start_indices >= bin_size]
+
+    # Subset data for complete sets of sectored counts.
+    # Each set of sectored counts is 10 minutes long, so we take the indices
+    # starting from the start indices and extend to the bin size of 10.
+    # This creates a 1D array of indices that correspond to the complete
+    # sets of sectored counts which is used to filter the L1A dataset and
+    # create the L1B sectored rates dataset.
+    data_indices = np.concatenate(
+        [np.arange(idx, idx + bin_size) for idx in start_indices]
+    )
+    l1b_sectored_rates_dataset = l1a_counts_dataset.isel(epoch=data_indices)
 
-    # Subset livetime staggered from sectored counts by 10 minutes
-    livetime_slice = slice(start_indices[0] - bin_size, start_indices[-1])
-    livetime = livetime[livetime_slice]
+    # Subset livetime values corresponding to the previous 10 minutes
+    # for each start index. This ensures the livetime data aligns correctly
+    # with the sectored counts for rate calculations.
+    livetime_indices = np.concatenate(
+        [np.arange(idx - bin_size, idx) for idx in start_indices]
+    )
+    livetime = livetime.isel(epoch=livetime_indices)
 
     return l1b_sectored_rates_dataset, livetime
 
diff --git a/imap_processing/tests/hit/test_hit_l1b.py b/imap_processing/tests/hit/test_hit_l1b.py
@@ -150,27 +150,113 @@ def test_sum_livetime_10min():
 
 def test_subset_data_for_sectored_counts():
     """Test the subset_data_for_sectored_counts function."""
-    # Create a sample L1A counts dataset
-    l1a_counts_dataset = xr.Dataset(
-        {
-            "hdr_minute_cnt": ("epoch", np.arange(105, 135)),
-            "h_sectored_counts": ("epoch", np.arange(0, 30)),
-            "he4_sectored_counts": ("epoch", np.arange(0, 30)),
-        },
-    )
+
+    def create_l1a_counts_dataset(hdr_minute_cnt_values):
+        """Helper to create L1A counts dataset."""
+        return xr.Dataset(
+            {
+                "hdr_minute_cnt": ("epoch", hdr_minute_cnt_values),
+                "h_sectored_counts": ("epoch", np.arange(len(hdr_minute_cnt_values))),
+                "he4_sectored_counts": ("epoch", np.arange(len(hdr_minute_cnt_values))),
+            },
+        )
+
+    def validate_subset(l1a_counts_dataset, livetime):
+        """Helper to validate the subset results."""
+        subset_dataset, subset_livetime = subset_data_for_sectored_counts(
+            l1a_counts_dataset, livetime
+        )
+        assert subset_dataset.sizes["epoch"] == 10
+        assert len(subset_livetime["epoch"]) == 10
+        assert np.all(subset_dataset["hdr_minute_cnt"].values % 10 == np.arange(10))
 
     # Create a sample livetime data array
     livetime = xr.DataArray(np.arange(1.0, 31.0, dtype=np.float32), dims=["epoch"])
 
-    # Call the function
-    subset_dataset, subset_livetime = subset_data_for_sectored_counts(
-        l1a_counts_dataset, livetime
+    # Test with partial data at the start and end of the dataset
+    l1a_counts_dataset = create_l1a_counts_dataset(np.arange(105, 135))
+    validate_subset(l1a_counts_dataset, livetime)
+
+    # Test with partial data in the middle of the dataset
+    l1a_counts_dataset = create_l1a_counts_dataset(
+        [
+            100,
+            101,
+            102,
+            103,
+            104,
+            105,
+            106,
+            107,
+            108,
+            109,
+            110,
+            111,
+            112,
+            113,
+            114,
+            120,
+            121,
+            122,
+            123,
+            124,
+            130,
+            131,
+            132,
+            133,
+            134,
+            135,
+            136,
+            137,
+            138,
+            139,
+        ]
     )
-
-    # Check the results
-    assert subset_dataset.sizes["epoch"] == 10
-    assert len(subset_livetime["epoch"]) == 10
-    assert np.all(subset_dataset["hdr_minute_cnt"].values % 10 == np.arange(10))
+    validate_subset(l1a_counts_dataset, livetime)
+
+    # Test with partial data at the start, middle, and end of the dataset
+    l1a_counts_dataset = create_l1a_counts_dataset(
+        [
+            105,
+            106,
+            107,
+            108,
+            109,
+            110,
+            111,
+            112,
+            113,
+            114,
+            115,
+            116,
+            117,
+            118,
+            119,
+            120,
+            121,
+            122,
+            130,
+            131,
+            132,
+            133,
+            134,
+            135,
+            136,
+            137,
+            138,
+            139,
+            140,
+            141,
+        ]
+    )
+    validate_subset(l1a_counts_dataset, livetime)
+
+    # Test with only partial data in the dataset
+    l1a_counts_dataset = create_l1a_counts_dataset(np.arange(100, 160, 2))
+    with pytest.raises(
+        ValueError, match="No valid start indices found for complete sectored counts."
+    ):
+        subset_data_for_sectored_counts(l1a_counts_dataset, livetime)
 
 
 def test_process_summed_rates_data(l1a_counts_dataset, livetime):