diff --git a/bash_utilities/config_file_example.json b/bash_utilities/config_file_example.json
deleted file mode 100644
index 7db35629..00000000
--- a/bash_utilities/config_file_example.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "input": {
-        "type": "volume",
-        "files": ["anat/t1.nii.gz", "dwi/fa.nii.gz", "dwi/fodf.nii.gz"],
-        "standardization": "per_file"
-    },
-    "wm_mask": {
-        "type": "volume",
-        "files": ["masks/wm.nii.gz"],
-        "standardization": "none"
-    },
-    "streamlines": {
-        "type": "streamlines",
-        "files": ["tractograms/ALL"]
-    }
-}
diff --git a/docs/2_A_creating_the_hdf5.rst b/docs/2_A_creating_the_hdf5.rst
index 0d4e704a..48707f0a 100644
--- a/docs/2_A_creating_the_hdf5.rst
+++ b/docs/2_A_creating_the_hdf5.rst
@@ -10,44 +10,27 @@ We chose to base our code on the hdf5 data. One reason is that it allows to regr
 The hdf5 may contain many groups of data. For instance, if your model needs an input volume and the streamlines as target, you might need one group for each. You might want to include tracking masks or any other required data.
 
-Volume groups will mimic nifti files. While creating the hdf5, you may concatenate many nifti files into a single group.
-
-Streamline groups will mimic tractogram files. Again, you may concatenate many .trk or .tck files in a single group, for instance you could concatenate many bundles per subject.
-
 2.2 How to organize your data?
 ******************************
 
-We suggest that your data should be organized correctly on your computer, such as described below.
-
 This is how your data should be organized before trying to load your data as a hdf5 file. This structure should hold whether you work with hdf5 or BIDS. Below, we call "dwi_ml_ready" the folder with correct organization.
 
 *Hint:* use symlinks to avoid doubling your data on disk!
 
 **dwi_ml_ready**
 
-This folder is the most important one and must be organized in a very precise way to be able to load the data as a hdf5 using our script **create_hdf5_dataset.py**. Each subject should have the exact same sub-folders and files. Then, you can create a **config_file.json** that will tell the script what to include in the hdf5 file.
+This folder is the most important one and must be organized in a very precise way to be able to load the data as a hdf5 using our script **dwiml_create_hdf5_dataset**. Each subject should have the exact same sub-folders and files. Then, you can create a **config_file.json** that will tell the script what to include in the hdf5 file.
 
 **Example:**
 
 .. code-block:: bash
 
     {database_name}
-    | original  =====>  Organized as you wish but if you intend on using
-                        tractoflow, you should organize it as below.
-    | {subject_id}
-    | dwi.nii.gz
-    | bval
-    | bvec
-    | t1.nii.gz
-    | preprocessed  =====>  Organized as you wish.
-    | {subject_id}
-    | Ex: Tractoflow folders
-    | Ex: bundles from Recobundles
     | dwi_ml_ready  =====>  Each subject should contain the exact same
                             sub-folders and files, such as below. It is also
                             possible to add prefixes to the files
                             (ex: subj1__t1.nii.gz) based on
-                            the subject id. For instance:
+                            the subject id.
     | {subject_id}
    | anat
    | t1.nii.gz
@@ -74,61 +57,68 @@ To create the hdf5 file, you will need a config file such as below.
HDF groups w
    {
        "input": {
            "type": "volume",
-            "files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz", "dwi/*__dwi.nii.gz], --> Will get, for instance, subX__dwi.nii.gz
-            "standardization": "all",
+            "files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz", "dwi/*__dwi.nii.gz"],
             "std_mask": ["masks/some_mask.nii.gz"]
-        },
+        },
        "target": {
            "type": "streamlines",
-            "files": ["tractograms/bundle1.trk", "tractograms/wholebrain.trk", "tractograms/*__wholebrain.trk"], ----> Will get, for instance, sub1000__bundle1.trk
+            "files": ["tractograms/bundle1.trk", "tractograms/wholebrain.trk", "tractograms/*__wholebrain.trk"],
            "connectivity_matrix": "my_file.npy",
-            "connectivity_nb_blocs": 6 ---> OR
+            "connectivity_nb_blocs": 6 ( OR )
            "connectivity_labels": "labels_volume_group",
            "dps_keys": ["dps1", "dps2"]
-        }
+        },
        "bad_streamlines": {
            "type": "streamlines",
-            "files": ["bad_tractograms/*"] ---> Will get all trk and tck files.
+            "files": ["bad_tractograms/*"]
-        }
+        },
        "wm_mask": {
            "type": "volume",
            "files": ["masks/wm_mask.nii.gz"]
-        }
+        }
    }
    |
+Each group key will become the group's **name** in the hdf5. It can be anything you want. We suggest you keep it meaningful, e.g. 'input_volume', 'target_volume', 'target_streamlines'. In our scripts (e.g. l2t_train_model.py, tt_train_model.py, etc.), you will often be asked for the labels given to your groups.
 
-General group attributes in the config file:
-""""""""""""""""""""""""""""""""""""""""""""
-
-Each group key will become the group's **name** in the hdf5. It can be anything you want. We suggest you keep it significative, ex 'input_volume', 'target_volume', 'target_directions'. In other scripts (ex, l2t_train_model.py, tt_train_model.py, etc), you will often be asked for the labels given to your groups.
-Each group may have a number of parameters:
+Required attributes for each group
+""""""""""""""""""""""""""""""""""
 
    - **"type"**: It must be recognized in dwi_ml. Currently, accepted datatypes are:
 
-        - 'volume': for instance, a dwi, an anat, mask, t1, fa, etc.
-        - 'streamlines': for instance, a .trk, .tck file (any format accepted by Dipy's *Stateful Tractogram*).
+        - 'volume': Volume groups will mimic nifti files. While creating the hdf5, you may concatenate many nifti files into a single group.
+
+        - 'streamlines': Streamline groups will mimic tractogram files. Again, you may concatenate many .trk or .tck files in a single group; for instance, you could concatenate many bundles per subject. Files must be in any format accepted by Dipy's *Stateful Tractogram*, such as .trk or .tck.
 
-    - **"files"**: The listed file(s) must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence is set to False). If there are more than one files, they will be concatenated (on the 4th dimension for volumes, using the union of tractograms for streamlines).
+    - **"files"**: The files to concatenate into a single volume or a single tractogram. They must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence' is set to False).
+
+      Note: There is the possibility to add a wildcard (\*), for instance if your files have variable prefixes (\*_T1.nii.gz will include subj1_T1.nii.gz), or to include many files (bundles/\*.trk will include all trk files in the bundles folder).
 
-    - There is the possibility to add a wildcard (\*).
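For reference, the real config file must be valid JSON; the example above contains inline annotations (the arrows and ``( OR )``) that must not appear in the actual file. A minimal sketch of how such a file could be generated, reusing the group names, file paths and options shown above (all of them illustrative only):

.. code-block:: python

    import json

    # Illustrative config: group names, file names and options are taken
    # from the example above, not a prescribed layout.
    config = {
        "input": {
            "type": "volume",
            "files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz"],
            "standardization": "per_file",
            "std_mask": ["masks/some_mask.nii.gz"],
        },
        "target": {
            "type": "streamlines",
            "files": ["tractograms/wholebrain.trk"],
        },
    }

    with open("config_file.json", "w") as f:
        json.dump(config, f, indent=4)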
Additional attributes for volume groups:
""""""""""""""""""""""""""""""""""""""""
 
-    - **std_mask**: The name of the standardization mask. Data is standardized (normalized) during data creation: data = (data - mean_in_mask) / std_in_mask. If more than one files are given, the union (logical_or) of all masks is used (ex of usage: ["masks/wm_mask.nii.gz", "masks/gm_mask.nii.gz"] would use a mask of all the brain).
+    - **std_mask**: The name of the standardization mask (see Note 1). Data is standardized (normalized) during data creation: data = (data - mean_in_mask) / std_in_mask. If more than one file is given, the union (logical_or) of all masks is used (e.g., ["masks/wm_mask.nii.gz", "masks/gm_mask.nii.gz"] would use a mask of the whole brain).
 
-    - **"standardization"**: It defined the standardization option applied to the volume group. It must be one of:
+    - **"standardization"**: It defines the standardization (normalization) option applied to the volume group. It must be one of:
 
-        - "all", to apply standardization (normalization) to the final (concatenated) file.
-        - "independent", to apply it independently on the last dimension of the data (ex, for a fODF, it would apply it independently on each SH).
-        - "per_file", to apply it independently on each file included in the group.
+        - "all", to apply standardization to the final (concatenated) file, per subject.
+        - "all_across_subjs", to apply standardization to the final file, across all subjects.
+        - "independent", to apply it independently on the last dimension of the data, i.e. on each feature. For instance, for a fODF, this would apply standardization independently on each SH coefficient, per subject.
+        - ("independent_across_subjs": not implemented!)
+        - "per_file", to apply it independently on each file concatenated in the volume, per subject.
+        - "per_file_across_subjs", to apply the same normalization to all subjects. See Note 2.
        - "none", to skip this step (default)
 
-****A note about data standardization**
+**Note 1: why we use a mask for standardization**
+
+If all voxels were used, most of them would probably contain the background of the data, bringing the mean and std very close to 0. Thus, only non-zero voxels are used to compute the mean and std, or the voxels inside the provided mask if one is given. If a mask is provided, voxels outside the mask could have been set to NaN, but the safer choice made here was to simply modify all voxels [ data = (data - mean) / std ], even voxels outside the mask, using the mean and std of voxels in the mask.
+
+**Note 2: how we apply standardization across subjects**
+
+When we create the hdf5, to apply the same standardization to all subjects, we could load the volumes from all subjects in the training set at once and compute their mean and std. This could become heavy in memory if the data is big (typically 4D volumes) and if there are a lot of subjects. Rather, as we loop over all subjects to prepare the data, we use `Welford's algorithm <https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance>`_ to compute the variance in an incremental way. The final mean and std [sqrt(variance)] are saved as attributes of the hdf5. Models have access to this information in the hdf5 and can later standardize any new data they receive, even unseen data from the testing set.
 
-If all voxel were to be used, most of them would probably contain the background of the data, bringing the mean and std probably very close to 0.
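A minimal numpy sketch of the per-subject standardization described in Note 1 (illustrative only; the actual implementation is ``standardize_data`` in ``dwi_ml.data.processing.dwi.dwi``, and the function name below is hypothetical):

.. code-block:: python

    import numpy as np

    def standardize_volume(data, mask=None, eps=1e-6):
        # data: 4D array (x, y, z, features); mask: 3D binary array or None.
        # Mean and std are computed only on voxels inside the mask (or on
        # non-zero voxels if no mask is given), but ALL voxels are modified.
        if mask is None:
            mask = np.any(data != 0, axis=-1)
        voxels = data[mask > 0]      # shape: (nb_voxels_in_mask, nb_features)
        mean, std = voxels.mean(), voxels.std()
        return (data - mean) / (std + eps)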
If a mask is provided, voxels outside the mask could have been set to NaN, but the simpler choice made here was to simply modify all voxels [ data = (data - mean) / std ], even voxels outside the mask, with the mean and std of voxels in the mask.
Mask name is provided through the config file. It is formatted as a list: if many files are listed, the union of the binary masks will be used.
 
 Additional attributes for streamlines groups:
@@ -141,10 +131,12 @@ Additional attributes for streamlines groups:
 
     - **dps_keys**: List of data_per_streamline keys to keep in memory in the hdf5.
 
+
+
 2.4. Creating the hdf5
 **********************
 
-You will use the **dwiml_create_hdf5_dataset.py** script to create a hdf5 file.
+You will use the **dwiml_create_hdf5_dataset** script to create a hdf5 file.
 
 .. code-block:: bash
 
@@ -160,6 +152,8 @@ You will use the **dwiml_create_hdf5_dataset.py** script to create a hdf5 file.
         $dwi_ml_folder $hdf5_file $config_file \
         $training_subjs $validation_subjs $testing_subjs
 
+You may later investigate the organization of your hdf5 with the script **dwiml_hdf5_print_architecture**.
+
 .. toctree::
     :maxdepth: 1
     :caption: Detailed explanations for developers:
diff --git a/docs/2_B_advanced_hdf5_organization.rst b/docs/2_B_advanced_hdf5_organization.rst
index 3793a2a4..0b85714e 100644
--- a/docs/2_B_advanced_hdf5_organization.rst
+++ b/docs/2_B_advanced_hdf5_organization.rst
@@ -11,6 +11,7 @@ Here is the output format created by dwiml_create_hdf5_dataset.py and recognized
     hdf5.attrs['training_subjs'] = the list of str representing the training subjects.
     hdf5.attrs['validation_subjs'] = the list of str representing the validation subjects.
     hdf5.attrs['testing_subjs'] = the list of str representing the testing subjects.
+    hdf5.attrs['means_and_stds_groupX'] = (mean, std) for the volume group named groupX (if normalization across subjects is used), where std = sqrt(variance). Each is a vector of length nb_features.
 
     # hdf5.keys() are the subjects.
     hdf5['subj1'].keys() are the groups from the config_file.
@@ -31,11 +32,12 @@ Here is the output format created by dwiml_create_hdf5_dataset.py and recognized
         # (others:)
         hdf5['subj1']['group1']['connectivity_matrix']
         hdf5['subj1']['group1']['connectivity_matrix_type'] = 'from_blocs' or 'from_labels'
-        hdf5['subj1']['group1']['connectivity_label_volume'] (the labels\' volume group) OR
-        hdf5['subj1']['group1']['connectivity_nb_blocs'] (a list of three integers)
-        hdf5['subj1']['group1']['data_per_streamline'] (a HDF5 group of 2D numpy arrays)
+        hdf5['subj1']['group1']['connectivity_label_volume'] = (the labels\' volume group) OR
+        hdf5['subj1']['group1']['connectivity_nb_blocs'] = (a list of three integers)
+        hdf5['subj1']['group1']['data_per_streamline'] = a HDF5 group of 2D numpy arrays
 
     # For volumes, other available data:
     hdf5['sub1']['group1']['affine']
     hdf5['sub1']['group1']['voxres']
     hdf5['sub1']['group1']['nb_features']
+
diff --git a/src/dwi_ml/cli/tests/test_create_hdf5_dataset.py b/src/dwi_ml/cli/tests/test_create_hdf5_dataset.py
index 311d0038..598a92f1 100644
--- a/src/dwi_ml/cli/tests/test_create_hdf5_dataset.py
+++ b/src/dwi_ml/cli/tests/test_create_hdf5_dataset.py
@@ -29,6 +29,9 @@
 #             "dps_keys": ['mean_color_dps', 'mock_2d_dps']
 #         }
 # }
+# TODO: test connectivity matrices (need to be added in the data)
+# TODO: Test other standardization options (need to modify the config_file.)
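# Illustrative sketch (an assumption, not an existing test): once the hdf5 is
# created, its organization (described in 2_B_advanced_hdf5_organization.rst)
# could be checked with h5py, for instance:
#     import h5py
#     with h5py.File(hdf5_output, 'r') as hdf:
#         print(hdf.attrs['training_subjs'])
#         for subj in hdf.keys():                # subjects
#             for group in hdf[subj].keys():     # groups from the config file
#                 print(subj, group, list(hdf[subj][group].keys()))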
+ def test_help_option(script_runner): ret = script_runner.run('dwiml_create_hdf5_dataset', '--help') @@ -49,5 +52,6 @@ def test_execution(script_runner): ret = script_runner.run('dwiml_create_hdf5_dataset', dwi_ml_folder, hdf5_output, config_file, - training_subjs, validation_subjs, testing_subjs) + training_subjs, validation_subjs, testing_subjs, + '-v', 'DEBUG') assert ret.success diff --git a/src/dwi_ml/data/dataset/multi_subject_containers.py b/src/dwi_ml/data/dataset/multi_subject_containers.py index 5aa999c3..cc148a1f 100644 --- a/src/dwi_ml/data/dataset/multi_subject_containers.py +++ b/src/dwi_ml/data/dataset/multi_subject_containers.py @@ -42,11 +42,23 @@ def __init__(self, set_name: str, hdf5_file: str, lazy: bool, self.set_name = set_name self.hdf5_file = hdf5_file + self.is_lazy = lazy + self.cache_size = cache_size + + # ----- General information: attributes of the hdf5 + # Volumes: self.volume_groups = [] # type: List[str] self.nb_features = [] # type: List[int] + self.means_and_std = {} + + # Streamlines: self.streamline_groups = [] # type: List[str] self.contains_connectivity = [] # type: np.ndarray + self.step_size = None + self.compress = None + + # ----- Information obtained by actually loading the data: # The subjects data list will be either a SubjectsDataList or a # LazySubjectsDataList depending on MultisubjectDataset.is_lazy. @@ -70,14 +82,7 @@ def __init__(self, set_name: str, hdf5_file: str, lazy: bool, self.streamline_lengths_mm = [] # type: List[List[int]] self.streamline_lengths = [] # type: List[List[int]] - # Preprocessing information will be found in the hdf5 later. - self.step_size = None - self.compress = None - - self.is_lazy = lazy - # This is only used in the lazy case. - self.cache_size = cache_size self.volume_cache_manager = None # type: SingleThreadCacheManager def close_all_handles(self): @@ -90,10 +95,15 @@ def close_all_handles(self): s.hdf_handle.close() s.hdf_handle = None - def set_subset_info(self, volume_groups, nb_features, streamline_groups, - contains_connectivity, step_size, compress): + def set_subset_info(self, volume_groups, nb_features, means_and_stds, + streamline_groups, contains_connectivity, + step_size, compress): + # Volumes: self.volume_groups = volume_groups self.nb_features = nb_features + self.means_and_std = means_and_stds + + # Streamlines: self.streamline_groups = streamline_groups self.contains_connectivity = contains_connectivity self.step_size = step_size @@ -385,6 +395,7 @@ def __init__(self, hdf5_file: str, lazy: bool, self.volume_groups = [] # type: List[str] self.nb_features = [] # type: List[int] + self.means_and_stds = {} self.streamline_groups = [] # type: List[str] self.streamlines_contain_connectivity = [] @@ -446,7 +457,7 @@ def load_data(self, load_training=True, load_validation=True, if step_size == 'Not defined by user': step_size = None if compress == 'Not defined by user': - compress = None + compress = None # Loading the first training subject's group information. # Others should fit. 
@@ -463,7 +474,7 @@ def load_data(self, load_training=True, load_validation=True,
             logger.debug("Streamline groups containing a connectivity matrix: "
                          "{}".format(contains_connectivity))
 
-            # Verifying groups of interest
+            # Verifying if groups of interest are indeed in the hdf5
             if volume_groups is not None:
                 missing_vol = np.setdiff1d(volume_groups, poss_volume_groups)
                 if len(missing_vol) > 0:
@@ -496,14 +507,24 @@ def load_data(self, load_training=True, load_validation=True,
                 logger.info("--> Using all streamline groups.")
                 self.streamline_groups = poss_strea_groups
                 self.streamlines_contain_connectivity = contains_connectivity
-
             self.streamline_groups = list(self.streamline_groups)
+
+            # Loading normalization information
+            for group in self.volume_groups:
+                if 'means_and_stds_' + group in hdf_handle.attrs:
+                    self.means_and_stds[group] = \
+                        hdf_handle.attrs['means_and_stds_' + group]
+                else:
+                    self.means_and_stds[group] = None
+
+            # Finalizing group information
             group_info = (self.volume_groups, self.nb_features,
+                          self.means_and_stds,
                           self.streamline_groups,
                           self.streamlines_contain_connectivity)
             self.training_set.set_subset_info(*group_info, step_size, compress)
             self.validation_set.set_subset_info(*group_info, step_size, compress)
-            self.testing_set.set_subset_info(*group_info, step_size, compress)
+            self.testing_set.set_subset_info(*group_info, step_size, compress)
 
         # LOADING
         if load_training:
diff --git a/src/dwi_ml/data/hdf5/hdf5_creation.py b/src/dwi_ml/data/hdf5/hdf5_creation.py
index 006b16fe..01fb5bfc 100644
--- a/src/dwi_ml/data/hdf5/hdf5_creation.py
+++ b/src/dwi_ml/data/hdf5/hdf5_creation.py
@@ -11,19 +11,21 @@
 from dipy.io.utils import is_header_compatible
 from dipy.tracking.utils import length
 import h5py
-from scilpy.image.labels import get_data_as_labels
-
-from dwi_ml.data.hdf5.utils import format_nb_blocs_connectivity
-from dwi_ml.data.processing.streamlines.data_augmentation import \
-    resample_or_compress
 from nested_lookup import nested_lookup
 import nibabel as nib
 import numpy as np
+from scilpy.image.labels import get_data_as_labels
 from scilpy.tractograms.tractogram_operations import concatenate_sft
 
 from dwi_ml.data.io import load_file_to4d
-from dwi_ml.data.processing.dwi.dwi import standardize_data
+from dwi_ml.data.hdf5.utils import format_nb_blocs_connectivity
+from dwi_ml.data.processing.dwi.dwi import OnlineMeanAndVariance, standardize_data
+from dwi_ml.data.processing.streamlines.data_augmentation import \
+    resample_or_compress
+
+STD_CHOICES = ['all', 'all_across_subjs', 'independent', 'per_file',
+               'per_file_across_subjs', 'none']
 
 
 def format_filelist(filenames, enforce_presence, folder=None) -> List[str]:
@@ -211,6 +213,9 @@ def __init__(self, root_folder: Path, out_hdf_filename: Path,
         # Check that all files exist
         if enforce_files_presence:
             self._check_files_presence()
+
+        # Preparing the data standardization across subjects
+        self.online_stats = {}
 
     def _analyse_config_file(self):
         """
@@ -238,20 +243,19 @@ def _analyse_config_file(self):
 
             # Volume groups
             if self.groups_config[group]['type'] == 'volume':
-                std_choices = ['all', 'independent', 'per_file', 'none']
                 if 'standardization' not in self.groups_config[group]:
                     raise KeyError(
                         "Group {}'s 'standardization' was not defined. It "
                         "should be one of {}. See the doc for a "
                         "groups_config.json example."
-                        .format(group, std_choices))
+                        .format(group, STD_CHOICES))
 
                 if self.groups_config[group]['standardization'] not in \
-                        std_choices:
+                        STD_CHOICES:
                     raise KeyError(
                         "Group {}'s 'standardization' should be one of {}, "
                         "but we got {}. See the doc for a groups_config.json "
                         "example."
-                        .format(group, std_choices,
+                        .format(group, STD_CHOICES,
                                 self.groups_config[group]['standardization']))
 
                 volume_groups.append(group)
@@ -398,6 +402,26 @@ def create_database(self):
                     logging.info("*Processing subject {}/{}: {}"
                                  .format(nb_processed, nb_subjs, subj_id))
                     self._create_one_subj(subj_id, hdf_handle)
+
+            # Save the final mean and std, for normalization across subjects.
+            # self.online_stats has been updated during the process.
+            # We would like to save a dictionary in the hdf5: the means and
+            # std for each group. But hdf5 does not support dictionaries.
+            # Saving multiple keys: hdf_handle.attrs['means_and_stds_group']
+            for group_name, info in self.online_stats.items():
+                # Stats are stored as a list: either one value for the whole
+                # volume group (len(list) = 1) or one value per nifti volume
+                # concatenated in the group.
+                assert isinstance(info, list)  # one value per volume
+                assert isinstance(info[0], OnlineMeanAndVariance)
+
+                mean = []
+                std = []
+                for volumeinfo in info:
+                    mean.extend([volumeinfo.mean] * volumeinfo.nb_features)
+                    std.extend([volumeinfo.std] * volumeinfo.nb_features)
+                means_and_stds = (np.asarray(mean), np.asarray(std))
+                hdf_handle.attrs['means_and_stds_' + group_name] = means_and_stds
 
         logging.info("Saved dataset : {}".format(self.out_hdf_filename))
 
@@ -407,16 +431,13 @@ def _create_one_subj(self, subj_id, hdf_handle):
         volume group(s) + streamline group(s).
         """
         subj_input_dir = self.root_folder.joinpath(subj_id)
-
         subj_hdf_group = hdf_handle.create_group(subj_id)
 
         # Add the subj data based on groups in the json config file
-        ref = self._create_volume_groups(subj_id, subj_input_dir,
-                                         subj_hdf_group)
-
+        ref = self._create_volume_groups(subj_id, subj_input_dir, subj_hdf_group)
         self._create_streamline_groups(ref, subj_input_dir, subj_id,
                                        subj_hdf_group)
-
+
     def _create_volume_groups(self, subj_id, subj_input_dir, subj_hdf_group):
         """
         Create the hdf5 groups for all volume groups in the config_file for a
@@ -453,9 +474,10 @@ def _create_volume_groups(self, subj_id, subj_input_dir, subj_hdf_group):
             # Adding the shape info separately to access it without loading
             # the data (useful for lazy data!).
             subj_hdf_group[group].attrs['nb_features'] = group_data.shape[-1]
+
         return ref_header
 
-    def _process_one_volume_group(self, group: str, subj_id: str,
+    def _process_one_volume_group(self, group_name: str, subj_id: str,
                                   subj_input_dir: Path):
         """
         Processes each volume group from the json config file for a given
@@ -482,17 +504,35 @@ def _process_one_volume_group(self, group: str, subj_id: str,
         group_affine: np.ndarray
             Affine for the group.
         """
-        std_mask = None
+        # Get the files and add the subject_dir as prefix.
+        file_list = self.groups_config[group_name]['files']
+        file_list = format_filelist(file_list, self.enforce_files_presence,
+                                    folder=subj_input_dir)
+
+        # Std option
         std_option = 'none'
-        if 'standardization' in self.groups_config[group]:
-            std_option = self.groups_config[group]['standardization']
-        if 'std_mask' in self.groups_config[group]:
+        if 'standardization' in self.groups_config[group_name]:
+            std_option = self.groups_config[group_name]['standardization']
+
+        # Instantiate online variances if this is the first subject.
+        # Just preparing the key in the dict; we need the number of features
+        # to actually instantiate it.
+        if group_name not in self.online_stats.keys():
+            if std_option == 'per_file_across_subjs':
+                self.online_stats[group_name] = [None] * len(file_list)
+            elif std_option == 'all_across_subjs':
+                self.online_stats[group_name] = [None]
+
+        # Std mask
+        std_mask = None
+        if 'std_mask' in self.groups_config[group_name]:
             if std_option == 'none':
                 logging.warning("You provided a std_mask for volume group {}, "
-                                "but std_option is 'none'. Skipping.")
+                                "but std_option is 'none'. Skipping."
+                                .format(group_name))
             else:
                 # Load subject's standardization mask. Can be a list of files.
-                std_masks = self.groups_config[group]['std_mask']
+                std_masks = self.groups_config[group_name]['std_mask']
                 std_masks = format_filelist(std_masks,
                                             self.enforce_files_presence,
                                             folder=subj_input_dir)
@@ -505,46 +545,50 @@ def _process_one_volume_group(self, group: str, subj_id: str,
                     else:
                         std_mask = np.logical_or(sub_mask_data, std_mask)
 
-        # Get the files and add the subject_dir as prefix.
-        file_list = self.groups_config[group]['files']
-        file_list = format_filelist(file_list, self.enforce_files_presence,
-                                    folder=subj_input_dir)
-
-        # First file will define data dimension and affine
-        logging.info("   - Processing file {} (first file=reference) "
-                     .format(os.path.basename(file_list[0])))
-        group_data, group_affine, group_res, group_header = load_file_to4d(
-            file_list[0])
-
-        if std_option == 'per_file':
-            logging.debug('   *Standardizing sub-data')
-            group_data = standardize_data(group_data, std_mask,
-                                          independent=False)
-
-        # Other files must fit (data shape, affine, voxel size)
-        # It is not a promise that data has been correctly registered, but it
-        # is a minimal check.
-        if len(file_list) > 1:
-            for file_name in file_list[1:]:
-                logging.info("   - Processing file {}"
-                             .format(os.path.basename(file_name)))
-                data = _load_and_verify_file(file_name, group, group_affine,
-                                             group_res)
-
-                if std_option == 'per_file':
-                    logging.info('   - Standardizing')
-                    data = standardize_data(data, std_mask, independent=False)
-
-                # Append file data to hdf group.
+        group_data = None
+        for i, file_name in enumerate(file_list):
+            logging.info("   - Processing file {}"
+                         .format(os.path.basename(file_name)))
+
+            # Loading.
+            # First file is loaded with header information.
+            # Other files must fit.
+            if i == 0:
+                logging.debug("First file = reference.")
+                data, group_affine, group_res, group_header = \
+                    load_file_to4d(file_name)
+            else:
+                data = _load_and_verify_file(file_name, group_name,
+                                             group_affine, group_res)
+
+            # Per-file standardization
+            if std_option == 'per_file':
+                logging.info('   - Standardizing')
+                data = standardize_data(data, std_mask, independent=False)
+            elif std_option == 'per_file_across_subjs':
+                logging.debug("   - Storing mean and variance for later "
+                              "standardization")
+                # Instantiate online variances if this is the first subject
+                if self.online_stats[group_name][i] is None:
+                    self.online_stats[group_name][i] = \
+                        OnlineMeanAndVariance(nb_features=data.shape[-1])
+
+                # Update the mean and variance with the current subject
+                self.online_stats[group_name][i].update_from_new_subject(data)
+
+            # Append file data to hdf group.
+            if i == 0:
+                group_data = data
+            else:
                 try:
                     group_data = np.append(group_data, data, axis=-1)
                 except ImportError:
                     raise ImportError(
                         'Data file {} could not be added to data group {}. '
-                        'Wrong dimensions?'.format(file_name, group))
+                        'Wrong dimensions?'.format(file_name, group_name))
 
-        # Standardize data (per channel) (if not done 'per_file' yet).
-        if std_option == 'independent':
+        # Standardize data (if not done 'per_file' yet).
+        if std_option == 'independent':  # i.e. per feature
             logging.info('    - Standardizing data on each feature.')
             group_data = standardize_data(group_data, std_mask,
                                           independent=True)
@@ -552,14 +596,24 @@ def _process_one_volume_group(self, group: str, subj_id: str,
             logging.info('    - Standardizing data as a whole.')
             group_data = standardize_data(group_data, std_mask,
                                           independent=False)
-        elif std_option not in ['none', 'per_file']:
-            raise ValueError("standardization must be one of "
-                             "['all', 'independent', 'per_file', 'none']")
+        elif std_option == 'all_across_subjs':
+            logging.info('    - Storing mean and std of data as a whole for '
+                         'later standardization.')
+            # Instantiate online variances if this is the first subject
+            if self.online_stats[group_name][0] is None:
+                self.online_stats[group_name][0] = \
+                    OnlineMeanAndVariance(nb_features=group_data.shape[-1])
+
+            # Update the mean and variance with the current subject
+            self.online_stats[group_name][0].update_from_new_subject(group_data)
+
+        elif std_option not in ['none', 'per_file', 'per_file_across_subjs']:
+            raise ValueError("standardization must be one of "
+                             "{}".format(STD_CHOICES))
 
         # Save standardized data
         if self.save_intermediate:
             output_fname = self.intermediate_folder.joinpath(
-                subj_id + '_' + group + ".nii.gz")
+                subj_id + '_' + group_name + ".nii.gz")
             logging.debug('   *Saving intermediate files into {}.'
                           .format(output_fname))
             standardized_img = nib.Nifti1Image(group_data, group_affine)
diff --git a/src/dwi_ml/data/processing/dwi/dwi.py b/src/dwi_ml/data/processing/dwi/dwi.py
index 48decfff..60953708 100644
--- a/src/dwi_ml/data/processing/dwi/dwi.py
+++ b/src/dwi_ml/data/processing/dwi/dwi.py
@@ -12,10 +12,94 @@
 eps = 1e-6
 
 
+class OnlineMeanAndVariance:
+    """
+    Class to help standardize data volumes. This procedure computes the mean
+    and variance of the training set, one volume (one subject) at a time. It
+    can be used during the preparation of the hdf5, for instance, where we
+    iterate over volumes, rather than needing to load all data at once in
+    memory.
+
+    At each iteration, we update the mean and variance from a batch of data
+    (all voxels in that volume).
+
+    Solution 1: (method = 'welford')
+    If we wanted to update a single observation at a time: see the Welford
+    variance algorithm:
+    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    So we can loop over voxels and use that algorithm (slow).
+
+    Solution 2: (method = 'batch')
+    See mathematics here:
+    https://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html
+
+    """
+    def __init__(self, nb_features, method='batch'):
+        assert method in ['batch', 'welford']
+
+        # Count = the total number of voxels across all subjects, all volumes
+        self.count = 0
+        self.mean = 0.0
+        self._variance = 0.0
+        self.method = method
+
+        # The mean and variance are computed over all voxels and all features.
+        # nb_features is only kept to verify the shape of incoming data.
+        self.nb_features = nb_features
+
+    def update_from_new_subject(self, x: np.ndarray):
+        """
+        Parameters
+        ----------
+        x: np.ndarray
+            A volume for one subject.
+        """
+        assert (x.shape[-1] == self.nb_features)
+        x = x.flatten()
+
+        if self.method == 'batch':
+            self.add_variable_batch(x)
+        else:
+            self.add_variable_welford(x)
+
+    def add_variable_batch(self, x):
+        m = self.count
+        mu1 = self.mean
+        s1 = self._variance
+
+        n = len(x)
+        mu2 = np.mean(x)
+        s2 = np.std(x) ** 2
+
+        self.count = m + n
+        self.mean = m/self.count*mu1 + n/self.count*mu2
+        self._variance = m/self.count*s1 + n/self.count*s2 + \
+            m*n/(self.count**2) * (mu1 - mu2)**2
+
+    def add_variable_welford(self, x):
+        # Currently looping over voxels. Not sure if it can be accelerated.
+        for xi in x:
+            self.count += 1
+            delta = xi - self.mean
+            self.mean += delta / self.count
+            self._variance += delta * (xi - self.mean)
+
+    @property
+    def variance(self) -> float:
+        if self.method == 'batch':
+            return self._variance
+        else:
+            return self._variance / self.count
+
+    @property
+    def std(self) -> float:
+        return np.sqrt(self.variance)
+
+
 def standardize_data(data: np.ndarray, mask: np.ndarray = None,
                      independent: bool = False):
     """Apply classic data standardization (centralized, normalized = zero-
-    centering and variance to 1).
+    centering and variance to 1) to a single volume.
 
     Parameters
     ----------
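# Illustrative usage sketch for OnlineMeanAndVariance (an assumption, not part
# of this module): the accumulated mean/std should match the values computed
# on the concatenation of all subjects' volumes.
#
#     import numpy as np
#     stats = OnlineMeanAndVariance(nb_features=3, method='batch')
#     volumes = [np.random.rand(8, 8, 8, 3) for _ in range(4)]
#     for vol in volumes:
#         stats.update_from_new_subject(vol)
#     all_voxels = np.concatenate([v.flatten() for v in volumes])
#     assert np.isclose(stats.mean, all_voxels.mean())
#     assert np.isclose(stats.std, all_voxels.std())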