16 changes: 0 additions & 16 deletions bash_utilities/config_file_example.json

This file was deleted.

84 changes: 39 additions & 45 deletions docs/2_A_creating_the_hdf5.rst
@@ -10,44 +10,27 @@ We chose to base our code on the hdf5 data. One reason is that it allows to regr

The hdf5 may contain many groups of data. For instance, if your model needs an input volume and the streamlines as target, you might need one group for each. You might want to include tracking masks or any other required data.

Volume groups will mimic nifti files. While creating the hdf5, you may concatenate many nifti files into a single group.

Streamline groups will mimic tractogram files. Again, you may concatenate many .trk or .tck files in a single group, for instance you could concatenate many bundles per subject.


2.2 How to organize your data?
******************************

We suggest that you organize your data on your computer as described below.

This is how your data should be organized before trying to load it as a hdf5 file. This structure should hold whether you work with hdf5 or BIDS. Below, we call "dwi_ml_ready" the folder with the correct organization.

*Hint:* use symlinks to avoid doubling your data on disk!

**dwi_ml_ready**

This folder is the most important one and must be organized in a very precise way to be able to load the data as a hdf5 using our script **create_hdf5_dataset.py**. Each subject should have the exact same sub-folders and files. Then, you can create a **config_file.json** that will tell the script what to include in the hdf5 file.
This folder is the most important one and must be organized in a very precise way so that the data can be loaded as a hdf5 using our script **dwiml_create_hdf5_dataset**. Each subject should have the exact same sub-folders and files. Then, you can create a **config_file.json** that will tell the script what to include in the hdf5 file.

**Example:**

.. code-block:: bash

{database_name}
| original =====> Organized as you wish but if you intend on using
tractoflow, you should organize it as below.
| {subject_id}
| dwi.nii.gz
| bval
| bvec
| t1.nii.gz
| preprocessed =====> Organized as you wish.
| {subject_id}
| Ex: Tractoflow folders
| Ex: bundles from Recobundles
| dwi_ml_ready =====> Each subject should contain the exact same sub-folders
and files, such as below. It is also possible to add
prefixes to the files (ex: subj1__t1.nii.gz) based on
the subject id. For instance:
the subject id.
| {subject_id}
| anat
| t1.nii.gz
@@ -74,61 +57,68 @@ To create the hdf5 file, you will need a config file such as below. HDF groups w
{
"input": {
"type": "volume",
"files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz", "dwi/*__dwi.nii.gz], --> Will get, for instance, subX__dwi.nii.gz
"standardization": "all",
"files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz", "dwi/*__dwi.nii.gz],
"std_mask": [masks/some_mask.nii.gz]
},
},
"target": {
"type": "streamlines",
"files": ["tractograms/bundle1.trk", "tractograms/wholebrain.trk", "tractograms/*__wholebrain.trk"], ----> Will get, for instance, sub1000__bundle1.trk
"files": ["tractograms/bundle1.trk", "tractograms/wholebrain.trk", "tractograms/*__wholebrain.trk"],
"connectivity_matrix": "my_file.npy",
"connectivity_nb_blocs": 6 ---> OR
"connectivity_nb_blocs": 6 ( OR )
"connectivity_labels": labels_volume_group,
"dps_keys": ['dps1', 'dps2']
}
}
"bad_streamlines": {
"type": "streamlines",
"files": ["bad_tractograms/*"] ---> Will get all trk and tck files.
}
"files": ["bad_tractograms/*"]
}
"wm_mask": {
"type": "volume",
"files": ["masks/wm_mask.nii.gz"]
}
}
}
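
For reference, here is a minimal sketch of a config following the structure above, written as a small Python snippet that dumps valid JSON. The group names and file paths are placeholders (not an official template); they must match files in each subject's dwi_ml_ready folder.

.. code-block:: python

    import json

    # Hypothetical minimal config: one volume group and one streamline group.
    config = {
        "input": {
            "type": "volume",
            "files": ["dwi/dwi.nii.gz", "anat/t1.nii.gz"],
            "standardization": "all",
            "std_mask": ["masks/wm_mask.nii.gz"]
        },
        "target": {
            "type": "streamlines",
            "files": ["tractograms/*.trk"]
        }
    }

    with open("config_file.json", "w") as f:
        json.dump(config, f, indent=4)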

|
Each group key will become the group's **name** in the hdf5. It can be anything you want. We suggest you keep it meaningful, e.g. 'input_volume', 'target_volume', 'target_streamlines'. In our scripts (e.g., l2t_train_model.py, tt_train_model.py, etc.), you will often be asked for the labels given to your groups.

General group attributes in the config file:
""""""""""""""""""""""""""""""""""""""""""""

Each group key will become the group's **name** in the hdf5. It can be anything you want. We suggest you keep it significative, ex 'input_volume', 'target_volume', 'target_directions'. In other scripts (ex, l2t_train_model.py, tt_train_model.py, etc), you will often be asked for the labels given to your groups.

Each group may have a number of parameters:
Required attributes for each group
""""""""""""""""""""""""""""""""""

- **"type"**: It must be recognized in dwi_ml. Currently, accepted datatype are:

- 'volume': for instance, a dwi, an anat, mask, t1, fa, etc.
- 'streamlines': for instance, a .trk, .tck file (any format accepted by Dipy's *Stateful Tractogram*).
- 'volume': Volume groups will mimic nifti files. While creating the hdf5, you may concatenate many nifti files into a single group.

- 'streamlines': Streamline groups will mimic tractogram files. Again, you may concatenate many .trk or .tck files in a single group; for instance, you could concatenate many bundles per subject. Files may be in any format accepted by Dipy's *Stateful Tractogram* (e.g., .trk or .tck).

- **"files"**: The listed file(s) must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence is set to False). If there are more than one files, they will be concatenated (on the 4th dimension for volumes, using the union of tractograms for streamlines).
- **"files"**: The files to concatenate into a single volume or a single tractogram. They must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence is set to False).

Note: you may add a wildcard (\*), for instance if your files have variable prefixes (*_T1.nii.gz will include subj1_T1.nii.gz), or to include many files (bundles/*.trk will include all trk files in the bundles folder).

- There is the possibility to add a wildcard (\*).

Additional attributes for volume groups:
""""""""""""""""""""""""""""""""""""""""

- **std_mask**: The name of the standardization mask. Data is standardized (normalized) during data creation: data = (data - mean_in_mask) / std_in_mask. If more than one files are given, the union (logical_or) of all masks is used (ex of usage: ["masks/wm_mask.nii.gz", "masks/gm_mask.nii.gz"] would use a mask of all the brain).
- **std_mask**: The name of the standardization mask (see Note 1). Data is standardized (normalized) during data creation: data = (data - mean_in_mask) / std_in_mask. If more than one file is given, the union (logical_or) of all masks is used (ex of usage: ["masks/wm_mask.nii.gz", "masks/gm_mask.nii.gz"] would use a mask covering the whole brain).

- **"standardization"**: It defined the standardization option applied to the volume group. It must be one of:
- **"standardization"**: It defines the standardization (normalization) option applied to the volume group. It must be one of:

- "all", to apply standardization (normalization) to the final (concatenated) file.
- "independent", to apply it independently on the last dimension of the data (ex, for a fODF, it would apply it independently on each SH).
- "per_file", to apply it independently on each file included in the group.
- "all", to apply standardization to the final (concatenated) file, per subject.
- "all_across_subjs", to apply standardization to the final file, across all subjects
- "independent", to apply it independently on the last dimension of the data; on each feature. For instance, for a fODF, this would apply standardization independently on each SH coefficient, per subject.
- ("independent_across_subjs": not implemented!)
- "per_file", to apply it independently on each file concatenated in the volume, per subject.
- "per_file_across_subjs", to apply the same normalization to all subjects. See note 2.
- "none", to skip this step (default)

**A note about data standardization**
**Note 1: why we use a mask for standardization**

If all voxels were used, most of them would probably contain the background of the data, bringing the mean and std very close to 0. Thus, only non-zero voxels are used to compute the mean and std, or voxels inside the provided mask if any. If a mask is provided, voxels outside the mask could have been set to NaN, but the safer choice made here was to simply modify all voxels [ data = (data - mean) / std ], even voxels outside the mask, using the mean and std of voxels inside the mask.
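
As a rough illustration (a minimal sketch, not the actual dwi_ml implementation; the helper name standardize_volume is hypothetical), the idea for a single 3D volume could look like this:

.. code-block:: python

    import numpy as np

    def standardize_volume(data: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
        """Masked standardization: statistics come from the mask (or from
        non-zero voxels if no mask is given), but every voxel is modified."""
        if mask is None:
            mask = data != 0
        mean = data[mask > 0].mean()
        std = data[mask > 0].std()
        return (data - mean) / std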

**Note 2: how we apply standardization across subjects**

When we create the hdf5, to apply the same standardization to all subjects, we could load the volumes of all training subjects at once and compute their mean and std. This could become heavy in memory if the data is big (typically 4D volumes) and if there are many subjects. Rather, as we loop over all subjects to prepare the data, we use `Welford's algorithm <https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance>`_ to compute the variance incrementally. The final mean and std [sqrt(variance)] are saved as attributes of the hdf5. Models have access to this information in the hdf5 and can later standardize any new data they receive, even unseen data from the testing set.
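
As a simplified sketch of that idea (the class name RunningStats and array shapes are illustrative, not dwi_ml's actual code), Welford's update can be written as:

.. code-block:: python

    import numpy as np

    class RunningStats:
        """Welford's algorithm: incremental per-feature mean and variance."""
        def __init__(self, nb_features: int):
            self.count = 0
            self.mean = np.zeros(nb_features)
            self.m2 = np.zeros(nb_features)

        def update(self, voxels: np.ndarray):
            # voxels: shape (nb_voxels, nb_features), from one subject.
            for x in voxels:
                self.count += 1
                delta = x - self.mean
                self.mean += delta / self.count
                self.m2 += delta * (x - self.mean)

        @property
        def std(self) -> np.ndarray:
            return np.sqrt(self.m2 / self.count)

Each subject's voxels can be fed to ``update`` as they are loaded, so only one subject's volume needs to be in memory at a time.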

If all voxel were to be used, most of them would probably contain the background of the data, bringing the mean and std probably very close to 0. Thus, non-zero voxels only are used to compute the mean and std, or voxels inside the provided mask if any. If a mask is provided, voxels outside the mask could have been set to NaN, but the simpler choice made here was to simply modify all voxels [ data = (data - mean) / std ], even voxels outside the mask, with the mean and std of voxels in the mask. Mask name is provided through the config file. It is formatted as a list: if many files are listed, the union of the binary masks will be used.


Additional attributes for streamlines groups:
@@ -141,10 +131,12 @@ Additional attributes for streamlines groups:

- **dps_keys**: List of data_per_streamline keys to keep in memory in the hdf5.



2.4. Creating the hdf5
**********************

You will use the **dwiml_create_hdf5_dataset.py** script to create a hdf5 file.
You will use the **dwiml_create_hdf5_dataset** script to create a hdf5 file.

.. code-block:: bash

@@ -160,6 +152,8 @@ You will use the **dwiml_create_hdf5_dataset.py** script to create a hdf5 file.
$dwi_ml_folder $hdf5_file $config_file \
$training_subjs $validation_subjs $testing_subjs

You may later investigate the organization of your hdf5 with the script **dwiml_hdf5_print_architecture**.

.. toctree::
:maxdepth: 1
:caption: Detailed explanations for developers:
8 changes: 5 additions & 3 deletions docs/2_B_advanced_hdf5_organization.rst
@@ -11,6 +11,7 @@ Here is the output format created by dwiml_create_hdf5_dataset.py and recognized
hdf5.attrs['training_subjs'] = the list of str representing the training subjects.
hdf5.attrs['validation_subjs'] = the list of str representing the validation subjects.
hdf5.attrs['testing_subjs'] = the list of str representing the testing subjects.
hdf5.attrs['means_and_stds_groupX'] = (mean, std) for the volume group named groupX (if normalization across subjects is used), where std = sqrt(variance). Each one is a vector of length nb_features.

# hdf5.keys() are the subjects.
hdf5['subj1'].keys() are the groups from the config_file.
@@ -31,11 +32,12 @@ Here is the output format created by dwiml_create_hdf5_dataset.py and recognized
# (others:)
hdf5['subj1']['group1']['connectivity_matrix']
hdf5['subj1']['group1']['connectivity_matrix_type'] = 'from_blocs' or 'from_labels'
hdf5['subj1']['group1']['connectivity_label_volume'] (the labels\' volume group) OR
hdf5['subj1']['group1']['connectivity_nb_blocs'] (a list of three integers)
hdf5['subj1']['group1']['data_per_streamline'] (a HDF5 group of 2D numpy arrays)
hdf5['subj1']['group1']['connectivity_label_volume'] = (the labels\' volume group) OR
hdf5['subj1']['group1']['connectivity_nb_blocs'] = (a list of three integers)
hdf5['subj1']['group1']['data_per_streamline'] = a HDF5 group of 2D numpy arrays

# For volumes, other available data:
hdf5['sub1']['group1']['affine']
hdf5['sub1']['group1']['voxres']
hdf5['sub1']['group1']['nb_features']
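
As a quick, hypothetical example (the file name is a placeholder; the keys follow the layout above), the hdf5 can also be browsed directly with h5py:

.. code-block:: python

    import h5py

    with h5py.File("my_dataset.hdf5", "r") as hdf:
        print("Training subjects:", hdf.attrs['training_subjs'])
        for subj in hdf.keys():  # hdf5 keys are the subjects
            for group, content in hdf[subj].items():
                print(subj, group, list(content.keys()))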

6 changes: 5 additions & 1 deletion src/dwi_ml/cli/tests/test_create_hdf5_dataset.py
@@ -29,6 +29,9 @@
# "dps_keys": ['mean_color_dps', 'mock_2d_dps']
# }
# }
# TODO: test connectivity matrices (need to be added in data)
# TODO: Test other standardization (need to modify the config_file.)


def test_help_option(script_runner):
ret = script_runner.run('dwiml_create_hdf5_dataset', '--help')
@@ -49,5 +52,6 @@ def test_execution(script_runner):

ret = script_runner.run('dwiml_create_hdf5_dataset',
dwi_ml_folder, hdf5_output, config_file,
training_subjs, validation_subjs, testing_subjs)
training_subjs, validation_subjs, testing_subjs,
'-v', 'DEBUG')
assert ret.success
47 changes: 34 additions & 13 deletions src/dwi_ml/data/dataset/multi_subject_containers.py
@@ -42,11 +42,23 @@ def __init__(self, set_name: str, hdf5_file: str, lazy: bool,

self.set_name = set_name
self.hdf5_file = hdf5_file
self.is_lazy = lazy
self.cache_size = cache_size

# ----- General information: attributes of the hdf5

# Volumes:
self.volume_groups = [] # type: List[str]
self.nb_features = [] # type: List[int]
self.means_and_std = {}

# Streamlines:
self.streamline_groups = [] # type: List[str]
self.contains_connectivity = [] # type: np.ndarray
self.step_size = None
self.compress = None

# ----- Information obtained by actually loading the data:

# The subjects data list will be either a SubjectsDataList or a
# LazySubjectsDataList depending on MultisubjectDataset.is_lazy.
@@ -70,14 +82,7 @@ def __init__(self, set_name: str, hdf5_file: str, lazy: bool,
self.streamline_lengths_mm = [] # type: List[List[int]]
self.streamline_lengths = [] # type: List[List[int]]

# Preprocessing information will be found in the hdf5 later.
self.step_size = None
self.compress = None

self.is_lazy = lazy

# This is only used in the lazy case.
self.cache_size = cache_size
self.volume_cache_manager = None # type: SingleThreadCacheManager

def close_all_handles(self):
@@ -90,10 +95,15 @@ def close_all_handles(self):
s.hdf_handle.close()
s.hdf_handle = None

def set_subset_info(self, volume_groups, nb_features, streamline_groups,
contains_connectivity, step_size, compress):
def set_subset_info(self, volume_groups, nb_features, means_and_stds,
streamline_groups, contains_connectivity,
step_size, compress):
# Volumes:
self.volume_groups = volume_groups
self.nb_features = nb_features
self.means_and_std = means_and_stds

# Streamlines:
self.streamline_groups = streamline_groups
self.contains_connectivity = contains_connectivity
self.step_size = step_size
@@ -385,6 +395,7 @@ def __init__(self, hdf5_file: str, lazy: bool,

self.volume_groups = [] # type: List[str]
self.nb_features = [] # type: List[int]
self.means_and_stds = {}
self.streamline_groups = [] # type: List[str]
self.streamlines_contain_connectivity = []

Expand Down Expand Up @@ -446,7 +457,7 @@ def load_data(self, load_training=True, load_validation=True,
if step_size == 'Not defined by user':
step_size = None
if compress == 'Not defined by user':
compress = None
compress = None

# Loading the first training subject's group information.
# Others should fit.
@@ -463,7 +474,7 @@ def load_data(self, load_training=True, load_validation=True,
logger.debug("Streamline groups containing a connectivity matrix: "
"{}".format(contains_connectivity))

# Verifying groups of interest
# Verifying if groups of interest are indeed in the hdf5
if volume_groups is not None:
missing_vol = np.setdiff1d(volume_groups, poss_volume_groups)
if len(missing_vol) > 0:
@@ -496,14 +507,24 @@ def load_data(self, load_training=True, load_validation=True,
logger.info("--> Using all streamline groups.")
self.streamline_groups = poss_strea_groups
self.streamlines_contain_connectivity = contains_connectivity

self.streamline_groups = list(self.streamline_groups)

# Loading normalization information
for group in self.volume_groups:
if 'means_and_stds_' + group in hdf_handle.attrs:
self.means_and_stds[group] = \
hdf_handle.attrs['means_and_stds_' + group]
else:
self.means_and_stds[group] = None

# Finalizing group information
group_info = (self.volume_groups, self.nb_features,
self.means_and_stds,
self.streamline_groups,
self.streamlines_contain_connectivity)
self.training_set.set_subset_info(*group_info, step_size, compress)
self.validation_set.set_subset_info(*group_info, step_size, compress)
self.testing_set.set_subset_info(*group_info, step_size, compress)
self.testing_set.set_subset_info(*group_info, step_size, compress)

# LOADING
if load_training: