Skip to content

Commit 5eedfb7

Browse files
committed
Merge branch 'develop' into feature/check_orientations
2 parents 2895e8c + b464a5d commit 5eedfb7

File tree

6 files changed

+185
-44
lines changed

6 files changed

+185
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2828
### Changed
2929

3030
- Moved `pygraphviz` from requirements to `graphviz` optional dependencies group.
31+
- Automatically tag untagged `subject_id` and `unique_id` as `!!str` when loading data config files.
3132
- Made orientation configurable (was hard-coded as "RPI").
3233

3334
### Fixed

CPAC/nuisance/utils/compcor.py

Lines changed: 78 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -91,18 +91,33 @@ def cosine_filter(
9191
failure_mode="error",
9292
):
9393
"""
94-
`cosine_filter` adapted from Nipype.
94+
Apply cosine filter to the input BOLD image using the discrete cosine transform (DCT) method.
95+
96+
Adapted from nipype implementation. https://github.com/nipy/nipype/blob/d353f0d/nipype/algorithms/confounds.py#L1086-L1107
97+
It removes the low-frequency drift from the voxel time series. The filtered image is saved to disk.
9598
96-
https://github.com/nipy/nipype/blob/d353f0d/nipype/algorithms/confounds.py#L1086-L1107
9799
98100
Parameters
99101
----------
100-
input_image_path : string
101-
Bold image to be filtered.
102+
input_image_path : str
103+
Path to the BOLD image to be filtered.
102104
timestep : float
103-
'Repetition time (TR) of series (in sec) - derived from image header if unspecified'
104-
period_cut : float
105-
Minimum period (in sec) for DCT high-pass filter, nipype default value: 128.
105+
Repetition time (TR) of the series (in seconds). Derived from image header if unspecified.
106+
period_cut : float, optional
107+
Minimum period (in seconds) for the DCT high-pass filter. Default value is 128.
108+
remove_mean : bool, optional
109+
Whether to remove the mean from the voxel time series before filtering. Default is True.
110+
axis : int, optional
111+
The axis along which to apply the filter. Default is -1 (last axis).
112+
failure_mode : {'error', 'ignore'}, optional
113+
Specifies how to handle failure modes. If set to 'error', the function raises an error.
114+
If set to 'ignore', it returns the input data unchanged in case of failure. Default is 'error'.
115+
116+
Returns
117+
-------
118+
cosfiltered_img : str
119+
Path to the filtered BOLD image.
120+
106121
"""
107122
# STATEMENT OF CHANGES:
108123
# This function is derived from sources licensed under the Apache-2.0 terms,
@@ -113,6 +128,7 @@ def cosine_filter(
113128
# * Removed calculation and return of `non_constant_regressors`
114129
# * Modified docstring to reflect local changes
115130
# * Updated style to match C-PAC codebase
131+
# * Updated to use generator and iterate over voxel time series to optimize memory usage.
116132

117133
# ORIGINAL WORK'S ATTRIBUTION NOTICE:
118134
# Copyright (c) 2009-2016, Nipype developers
@@ -132,41 +148,74 @@ def cosine_filter(
132148
# Prior to release 0.12, Nipype was licensed under a BSD license.
133149

134150
# Modifications copyright (C) 2019 - 2024 C-PAC Developers
135-
from nipype.algorithms.confounds import _cosine_drift, _full_rank
151+
try:
136152

137-
input_img = nib.load(input_image_path)
138-
input_data = input_img.get_fdata()
153+
def voxel_generator():
154+
for i in range(datashape[0]):
155+
for j in range(datashape[1]):
156+
for k in range(datashape[2]):
157+
yield input_data[i, j, k, :]
139158

140-
datashape = input_data.shape
141-
timepoints = datashape[axis]
142-
if datashape[0] == 0 and failure_mode != "error":
143-
return input_data, np.array([])
159+
from nipype.algorithms.confounds import _cosine_drift, _full_rank
144160

145-
input_data = input_data.reshape((-1, timepoints))
161+
input_img = nib.load(input_image_path)
162+
input_data = input_img.get_fdata()
163+
datashape = input_data.shape
164+
timepoints = datashape[axis]
165+
if datashape[0] == 0 and failure_mode != "error":
166+
return input_data, np.array([])
146167

147-
frametimes = timestep * np.arange(timepoints)
148-
X = _full_rank(_cosine_drift(period_cut, frametimes))[0]
168+
frametimes = timestep * np.arange(timepoints)
169+
X_full = _full_rank(_cosine_drift(period_cut, frametimes))[0]
149170

150-
betas = np.linalg.lstsq(X, input_data.T)[0]
171+
# Generate X with and without the mean column
172+
X_with_mean = X_full
173+
X_without_mean = X_full[:, :-1] if X_full.shape[1] > 1 else X_full
151174

152-
if not remove_mean:
153-
X = X[:, :-1]
154-
betas = betas[:-1]
175+
# Reshape the input data to bring the time dimension to the last axis if it's not already
176+
if axis != -1:
177+
reshaped_data = np.moveaxis(input_data, axis, -1)
178+
else:
179+
reshaped_data = input_data
180+
181+
reshaped_output_data = np.zeros_like(reshaped_data)
182+
183+
# Choose the appropriate X matrix
184+
X = X_without_mean if remove_mean else X_with_mean
155185

156-
residuals = input_data - X.dot(betas).T
186+
voxel_gen = voxel_generator()
157187

158-
output_data = residuals.reshape(datashape)
188+
for i in range(reshaped_data.shape[0]):
189+
IFLOGGER.info(
190+
f"calculating {i+1} of {reshaped_data.shape[0]} row of voxels"
191+
)
192+
for j in range(reshaped_data.shape[1]):
193+
for k in range(reshaped_data.shape[2]):
194+
voxel_time_series = next(voxel_gen)
195+
betas = np.linalg.lstsq(X, voxel_time_series.T, rcond=None)[0]
196+
197+
residuals = voxel_time_series - X.dot(betas)
198+
reshaped_output_data[i, j, k, :] = residuals
199+
200+
# Move the time dimension back to its original position if it was reshaped
201+
if axis != -1:
202+
output_data = np.moveaxis(reshaped_output_data, -1, axis)
203+
else:
204+
output_data = reshaped_output_data
159205

160-
hdr = input_img.header
161-
output_img = nib.Nifti1Image(output_data, header=hdr, affine=input_img.affine)
206+
hdr = input_img.header
207+
output_img = nib.Nifti1Image(output_data, header=hdr, affine=input_img.affine)
208+
file_name = input_image_path[input_image_path.rindex("/") + 1 :]
162209

163-
file_name = input_image_path[input_image_path.rindex("/") + 1 :]
210+
cosfiltered_img = os.path.join(os.getcwd(), file_name)
164211

165-
cosfiltered_img = os.path.join(os.getcwd(), file_name)
212+
output_img.to_filename(cosfiltered_img)
166213

167-
output_img.to_filename(cosfiltered_img)
214+
return cosfiltered_img
168215

169-
return cosfiltered_img
216+
except Exception as e:
217+
message = f"Error in cosine_filter: {e}"
218+
IFLOGGER.error(message)
170219

171220

172221
def fallback_svd(a, full_matrices=True, compute_uv=True):

CPAC/utils/bids_utils.py

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,13 @@
1414

1515
# You should have received a copy of the GNU Lesser General Public
1616
# License along with C-PAC. If not, see <https://www.gnu.org/licenses/>.
17+
from base64 import b64decode
18+
from collections.abc import Iterable
1719
import json
1820
import os
1921
import re
2022
import sys
23+
from typing import Any, Callable, Optional
2124
from warnings import warn
2225

2326
from botocore.exceptions import BotoCoreError
@@ -26,6 +29,16 @@
2629
from CPAC.utils.monitoring import UTLOGGER
2730

2831

32+
class SpecifiedBotoCoreError(BotoCoreError):
33+
"""Specified :py:class:`~botocore.exceptions.BotoCoreError`."""
34+
35+
def __init__(self, msg: str, *args, **kwargs) -> None:
36+
"""Initialize BotoCoreError with message."""
37+
msg = msg.format(**kwargs)
38+
Exception.__init__(self, msg)
39+
self.kwargs = kwargs
40+
41+
2942
def bids_decode_fname(file_path, dbg=False, raise_error=True):
3043
f_dict = {}
3144

@@ -842,7 +855,7 @@ def collect_bids_files_configs(bids_dir, aws_input_creds=""):
842855
f"Error retrieving {s3_obj.key.replace(prefix, '')}"
843856
f" ({e.message})"
844857
)
845-
raise BotoCoreError(msg) from e
858+
raise SpecifiedBotoCoreError(msg) from e
846859
elif "nii" in str(s3_obj.key):
847860
file_paths.append(
848861
str(s3_obj.key).replace(prefix, "").lstrip("/")
@@ -868,9 +881,15 @@ def collect_bids_files_configs(bids_dir, aws_input_creds=""):
868881
): json.load(open(os.path.join(root, f), "r"))
869882
}
870883
)
871-
except UnicodeDecodeError:
884+
except UnicodeDecodeError as unicode_decode_error:
872885
msg = f"Could not decode {os.path.join(root, f)}"
873-
raise UnicodeDecodeError(msg)
886+
raise UnicodeDecodeError(
887+
unicode_decode_error.encoding,
888+
unicode_decode_error.object,
889+
unicode_decode_error.start,
890+
unicode_decode_error.end,
891+
msg,
892+
)
874893

875894
if not file_paths and not config_dict:
876895
msg = (
@@ -983,15 +1002,35 @@ def insert_entity(resource, key, value):
9831002
return "_".join([*new_entities[0], f"{key}-{value}", *new_entities[1], suff])
9841003

9851004

986-
def load_yaml_config(config_filename, aws_input_creds):
1005+
def apply_modifications(
1006+
yaml_contents: str, modifications: Optional[list[Callable[[str], str]]]
1007+
) -> str:
1008+
"""Apply modification functions to YAML contents"""
1009+
if modifications:
1010+
for modification in modifications:
1011+
yaml_contents = modification(yaml_contents)
1012+
return yaml_contents
1013+
1014+
1015+
def load_yaml_config(
1016+
config_filename: str,
1017+
aws_input_creds,
1018+
modifications: Optional[list[Callable[[str], str]]] = None,
1019+
) -> dict | list | str:
1020+
"""Load a YAML config file, possibly from AWS, with modifications applied.
1021+
1022+
`modifications` should be a list of functions that take a single string argument (the loaded YAML contents) and return a single string argument (the modified YAML contents).
1023+
"""
9871024
if config_filename.lower().startswith("data:"):
9881025
try:
989-
header, encoded = config_filename.split(",", 1)
990-
config_content = b64decode(encoded)
1026+
_header, encoded = config_filename.split(",", 1)
1027+
config_content = apply_modifications(
1028+
b64decode(encoded).decode("utf-8"), modifications
1029+
)
9911030
return yaml.safe_load(config_content)
992-
except:
1031+
except Exception:
9931032
msg = f"Error! Could not find load config from data URI {config_filename}"
994-
raise BotoCoreError(msg)
1033+
raise SpecifiedBotoCoreError(msg=msg)
9951034

9961035
if config_filename.lower().startswith("s3://"):
9971036
# s3 paths begin with s3://bucket/
@@ -1013,7 +1052,8 @@ def load_yaml_config(config_filename, aws_input_creds):
10131052
config_filename = os.path.realpath(config_filename)
10141053

10151054
try:
1016-
return yaml.safe_load(open(config_filename, "r"))
1055+
with open(config_filename, "r") as _f:
1056+
return yaml.safe_load(apply_modifications(_f.read(), modifications))
10171057
except IOError:
10181058
msg = f"Error! Could not find config file {config_filename}"
10191059
raise FileNotFoundError(msg)
@@ -1110,6 +1150,25 @@ def create_cpac_data_config(
11101150
return sub_list
11111151

11121152

1153+
def _check_value_type(
1154+
sub_list: list[dict[str, Any]],
1155+
keys: list[str] = ["subject_id", "unique_id"],
1156+
value_type: type = int,
1157+
any_or_all: Callable[[Iterable], bool] = any,
1158+
) -> bool:
1159+
"""Check if any or all of a key in a sub_list is of a given type."""
1160+
return any_or_all(
1161+
isinstance(sub.get(key), value_type) for key in keys for sub in sub_list
1162+
)
1163+
1164+
1165+
def coerce_data_config_strings(contents: str) -> str:
1166+
"""Coerge `subject_id` and `unique_id` to be strings."""
1167+
for key in ["subject_id: ", "unique_id: "]:
1168+
contents = re.sub(f"{key}(?!!!)", f"{key}!!str ", contents)
1169+
return contents.replace(": !!str !!", ": !!")
1170+
1171+
11131172
def load_cpac_data_config(data_config_file, participant_labels, aws_input_creds):
11141173
"""
11151174
Loads the file as a check to make sure it is available and readable.
@@ -1127,7 +1186,9 @@ def load_cpac_data_config(data_config_file, participant_labels, aws_input_creds)
11271186
-------
11281187
list
11291188
"""
1130-
sub_list = load_yaml_config(data_config_file, aws_input_creds)
1189+
sub_list: list[dict[str, str]] = load_yaml_config(
1190+
data_config_file, aws_input_creds, modifications=[coerce_data_config_strings]
1191+
)
11311192

11321193
if participant_labels:
11331194
sub_list = [

CPAC/utils/tests/configs/__init__.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
"""Configs for testing."""
22

3-
from pathlib import Path
3+
from importlib import resources
4+
5+
try:
6+
from importlib.resources.abc import Traversable
7+
except ModuleNotFoundError: # TODO: Remove this block once minimum Python version includes `importlib.resources.abc`
8+
from importlib.abc import Traversable
49

5-
from pkg_resources import resource_filename
610
import yaml
711

8-
_TEST_CONFIGS_PATH = Path(resource_filename("CPAC", "utils/tests/configs"))
9-
with open(_TEST_CONFIGS_PATH / "neurostars_23786.yml", "r", encoding="utf-8") as _f:
12+
_TEST_CONFIGS_PATH: Traversable = resources.files("CPAC").joinpath(
13+
"utils/tests/configs"
14+
)
15+
with (_TEST_CONFIGS_PATH / "neurostars_23786.yml").open("r", encoding="utf-8") as _f:
1016
# A loaded YAML file to test https://tinyurl.com/neurostars23786
1117
NEUROSTARS_23786 = _f.read()
12-
with open(_TEST_CONFIGS_PATH / "neurostars_24035.yml", "r", encoding="utf-8") as _f:
18+
with (_TEST_CONFIGS_PATH / "neurostars_24035.yml").open("r", encoding="utf-8") as _f:
1319
# A loaded YAML file to test https://tinyurl.com/neurostars24035
1420
NEUROSTARS_24035 = _f.read()
1521
# A loaded YAML file to test https://tinyurl.com/cmicnlslack420349
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
- site: site-1
2+
subject_id: 01
3+
unique_id: 02
4+
derivatives_dir: /fprep/sub-0151
5+
- site: site-1
6+
subject_id: !!str 02
7+
unique_id: 02
8+
derivatives_dir: /fprep/sub-0151

CPAC/utils/tests/test_bids_utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,21 @@
1616
# License along with C-PAC. If not, see <https://www.gnu.org/licenses/>.
1717
"""Tests for bids_utils."""
1818

19+
from importlib import resources
1920
import os
2021
from subprocess import run
2122

2223
import pytest
2324
import yaml
2425

2526
from CPAC.utils.bids_utils import (
27+
_check_value_type,
2628
bids_gen_cpac_sublist,
2729
cl_strip_brackets,
2830
collect_bids_files_configs,
2931
create_cpac_data_config,
3032
load_cpac_data_config,
33+
load_yaml_config,
3134
sub_list_filter_by_labels,
3235
)
3336
from CPAC.utils.monitoring.custom_logging import getLogger
@@ -107,6 +110,19 @@ def test_gen_bids_sublist(bids_dir, test_yml, creds_path, dbg=False):
107110
assert sublist
108111

109112

113+
def test_load_data_config_with_ints() -> None:
114+
"""Check that C-PAC coerces sub- and ses- ints to strings."""
115+
data_config_file = resources.files("CPAC").joinpath(
116+
"utils/tests/configs/github_2144.yml"
117+
)
118+
# make sure there are ints in the test data
119+
assert _check_value_type(load_yaml_config(str(data_config_file), None))
120+
# make sure there aren't ints when it's loaded through the loader
121+
assert not _check_value_type(
122+
load_cpac_data_config(str(data_config_file), None, None)
123+
)
124+
125+
110126
@pytest.mark.parametrize("t1w_label", ["acq-HCP", "acq-VNavNorm", "T1w", None])
111127
@pytest.mark.parametrize(
112128
"bold_label", ["task-peer_run-1", "[task-peer_run-1 task-peer_run-2]", "bold", None]

0 commit comments

Comments
 (0)