khanlab · akhanf · Feb 25, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026
diff --git a/cfmm2tar/api.py b/cfmm2tar/api.py
@@ -231,6 +231,7 @@ def download_studies(
     keep_sorted_dicom: bool = False,
     skip_derived: bool = False,
     additional_tags: dict[str, str] | None = None,
+    use_gzip: bool = False,
 ) -> str:
     """
     Download DICOM studies from the server and create tar archives.
@@ -265,6 +266,7 @@ def download_studies(
                         Dict maps DICOM tag (hex string) to field name.
                         Example: {"00100030": "PatientBirthDate", "00100040": "PatientSex"}
                         (default: None)
+        use_gzip: Create gzip-compressed tar files (.tar.gz instead of .tar) (default: False)
 
     Returns:
         Path to the output directory containing the downloaded tar files
@@ -315,6 +317,14 @@ def download_studies(
         ...         "00100040": "PatientSex"
         ...     }
         ... )
+
+        Download with gzip compression:
+        >>> download_studies(
+        ...     output_dir="/path/to/output",
+        ...     study_description="Khan^NeuroAnalytics",
+        ...     study_date="20240101",
+        ...     use_gzip=True
+        ... )
     """
     # Get credentials
     username, password = _get_credentials(username, password, credentials_file)
@@ -352,6 +362,7 @@ def download_studies(
                 force_refresh_trust_store=force_refresh_trust_store,
                 skip_derived=skip_derived,
                 additional_tags=additional_tags,
+                use_gzip=use_gzip,
             )
     else:
         # Single UID or wildcard
@@ -372,6 +383,7 @@ def download_studies(
             force_refresh_trust_store=force_refresh_trust_store,
             skip_derived=skip_derived,
             additional_tags=additional_tags,
+            use_gzip=use_gzip,
         )
 
     # Clean up temp directory if empty
@@ -397,6 +409,7 @@ def download_studies_from_metadata(
     keep_sorted_dicom: bool = False,
     skip_derived: bool = False,
     additional_tags: dict[str, str] | None = None,
+    use_gzip: bool = False,
 ) -> str:
     """
     Download DICOM studies using UIDs from metadata.
@@ -479,6 +492,13 @@ def download_studies_from_metadata(
         ...     metadata="study_metadata.tsv",
         ...     additional_tags={"00100030": "PatientBirthDate"}
         ... )
+
+        Download with gzip compression:
+        >>> download_studies_from_metadata(
+        ...     output_dir="/path/to/output",
+        ...     metadata="study_metadata.tsv",
+        ...     use_gzip=True
+        ... )
     """
     # Get credentials
     username, password = _get_credentials(username, password, credentials_file)
@@ -570,6 +590,7 @@ def download_studies_from_metadata(
             force_refresh_trust_store=force_refresh_trust_store,
             skip_derived=skip_derived,
             additional_tags=additional_tags,
+            use_gzip=use_gzip,
         )
 
     # Clean up temp directory if empty

diff --git a/cfmm2tar/cli.py b/cfmm2tar/cli.py
@@ -168,6 +168,12 @@ def main():
         action="store_true",
         help="Skip DICOM files with ImageType containing DERIVED (e.g., reformats, derived images)",
     )
+    parser.add_argument(
+        "--gzip",
+        dest="use_gzip",
+        action="store_true",
+        help="Create gzip-compressed tar files (.tar.gz instead of .tar)",
+    )
     parser.add_argument(
         "--metadata-tags",
         dest="metadata_tags",
@@ -378,6 +384,7 @@ def main():
                     skip_derived=args.skip_derived,
                     additional_tags=additional_tags,
                     tls_cipher=args.tls_cipher,
+                    use_gzip=args.use_gzip,
                 )
         else:
             # Normal mode - use search criteria (no specific UIDs provided)
@@ -399,6 +406,7 @@ def main():
                 skip_derived=args.skip_derived,
                 additional_tags=additional_tags,
                 tls_cipher=args.tls_cipher,
+                use_gzip=args.use_gzip,
             )
 
         # Clean up temp directory if empty

diff --git a/cfmm2tar/dicom_sorter.py b/cfmm2tar/dicom_sorter.py
@@ -261,7 +261,7 @@ def sort(self):
 
         return sorted_dirs
 
-    def tar(self, depth, tar_filename_sep="_"):
+    def tar(self, depth, tar_filename_sep="_", use_gzip=False):
         """
         extract, apply sort rule, unwrap non-imaging dicom files, and create tar files(imaging->*.tar,non-imaging->*.attached.tar)
 
@@ -272,6 +272,7 @@ def tar(self, depth, tar_filename_sep="_"):
                     given depth = 4,
                     tar filename is: project_study_date_patient_name_study_id.tar
             tar_filename_sep: seprator of the tar file name elements
+            use_gzip: if True, write gzip-compressed archives (imaging->*.tar.gz, non-imaging->*.attached.tar.gz)
 
         output:
             tar_full_filename_list: list of resulted tar filenames
@@ -320,12 +321,14 @@ def tar(self, depth, tar_filename_sep="_"):
             # dir_split: ['PI','Project','19700101','1970_01_01_T2','1.9AC66A0D','0003','1970_01_01_T2.MR.PI_project.0003.0194.19700101.D6C44EC8.dcm']
             dir_split = relative_path_new_filename.split(os.sep)
 
-            tar_filename = tar_filename_sep.join(dir_split[:depth]) + ".tar"
+            tar_ext = ".tar.gz" if use_gzip else ".tar"
+            tar_filename = tar_filename_sep.join(dir_split[:depth]) + tar_ext
             tar_full_filename = os.path.join(self.output_dir, tar_filename)
             tar_full_filename_dict[tar_full_filename].append(item)
 
+        tar_mode = "w:gz" if use_gzip else "w"
         for tar_full_filename, items in tar_full_filename_dict.items():
-            with tarfile.open(tar_full_filename, "w") as tar:
+            with tarfile.open(tar_full_filename, tar_mode) as tar:
                 for item in items:
                     original_full_filename = item[0]
                     relative_path_new_filename = item[1]
@@ -334,7 +337,8 @@ def tar(self, depth, tar_filename_sep="_"):
                     tar.add(original_full_filename, arcname=arcname)
 
         # tar non-imaging:
-        attached_tar_full_filenames = []
+        # Collect all unwrapped dirs for each attached tar file
+        attached_tar_dict = defaultdict(list)
         for item in before_after_sort_rule_list:
             original_full_filename = item[0]
             relative_path_new_filename = item[1]
@@ -343,20 +347,19 @@ def tar(self, depth, tar_filename_sep="_"):
 
             if unwraped_dir:
                 dir_split = relative_path_new_filename.split(os.sep)
-                attached_tar_filename = tar_filename_sep.join(dir_split[:depth]) + ".attached.tar"
+                attached_tar_ext = ".attached.tar.gz" if use_gzip else ".attached.tar"
+                attached_tar_filename = tar_filename_sep.join(dir_split[:depth]) + attached_tar_ext
                 attached_tar_full_filename = os.path.join(self.output_dir, attached_tar_filename)
-
                 tar_arcname = relative_path_new_filename + "_unwraped"
+                attached_tar_dict[attached_tar_full_filename].append((unwraped_dir, tar_arcname))
 
-                if attached_tar_full_filename not in attached_tar_full_filenames:
-                    with tarfile.open(attached_tar_full_filename, "w") as tar:
-                        tar.add(unwraped_dir, arcname=tar_arcname)
-
-                    attached_tar_full_filenames.append(attached_tar_full_filename)
-
-                else:
-                    with tarfile.open(attached_tar_full_filename, "a") as tar:
-                        tar.add(unwraped_dir, arcname=tar_arcname)
+        # Write all attached tar files
+        attached_tar_full_filenames = []
+        for attached_tar_full_filename, items in attached_tar_dict.items():
+            with tarfile.open(attached_tar_full_filename, tar_mode) as tar:
+                for unwraped_dir, tar_arcname in items:
+                    tar.add(unwraped_dir, arcname=tar_arcname)
+            attached_tar_full_filenames.append(attached_tar_full_filename)
 
         return list(tar_full_filename_dict.keys()) + attached_tar_full_filenames
 

diff --git a/cfmm2tar/retrieve_cfmm_tar.py b/cfmm2tar/retrieve_cfmm_tar.py
@@ -77,6 +77,7 @@ def main(
     skip_derived=False,
     additional_tags=None,
     tls_cipher="TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
+    use_gzip=False,
 ):
     """
     main workflow: for each study: query,retrieve,tar
@@ -194,7 +195,7 @@ def main(
             # according to CFMM's rule, folder depth is 5:
             # pi/project/study_date/patient/studyID_and_hash_studyInstanceUID
             # a list with one element, retrieved_dicom_dir contain's one study
-            tar_full_filenames = d.tar(5)
+            tar_full_filenames = d.tar(5, use_gzip=use_gzip)
 
             # if there is no dicom files in the retrieved folder, tar_full_filenames is None
             if not tar_full_filenames:
@@ -205,7 +206,11 @@ def main(
             logger.info(f"tar file created: {tar_full_filename}")
 
             # .uid file
-            uid_full_filename = tar_full_filename[:-3] + "uid"
+            # Strip .tar or .tar.gz extension to add .uid
+            if tar_full_filename.endswith(".tar.gz"):
+                uid_full_filename = tar_full_filename[:-7] + ".uid"
+            else:
+                uid_full_filename = tar_full_filename[:-4] + ".uid"
             with open(uid_full_filename, "w") as f:
                 f.write(StudyInstanceUID + "\n")