diff --git a/cfmm2tar/api.py b/cfmm2tar/api.py index 12a971e..959d051 100644 --- a/cfmm2tar/api.py +++ b/cfmm2tar/api.py @@ -231,6 +231,7 @@ def download_studies( keep_sorted_dicom: bool = False, skip_derived: bool = False, additional_tags: dict[str, str] | None = None, + use_gzip: bool = False, ) -> str: """ Download DICOM studies from the server and create tar archives. @@ -265,6 +266,7 @@ def download_studies( Dict maps DICOM tag (hex string) to field name. Example: {"00100030": "PatientBirthDate", "00100040": "PatientSex"} (default: None) + use_gzip: Create gzip-compressed tar files (.tar.gz instead of .tar) (default: False) Returns: Path to the output directory containing the downloaded tar files @@ -315,6 +317,14 @@ def download_studies( ... "00100040": "PatientSex" ... } ... ) + + Download with gzip compression: + >>> download_studies( + ... output_dir="/path/to/output", + ... study_description="Khan^NeuroAnalytics", + ... study_date="20240101", + ... use_gzip=True + ... ) """ # Get credentials username, password = _get_credentials(username, password, credentials_file) @@ -352,6 +362,7 @@ def download_studies( force_refresh_trust_store=force_refresh_trust_store, skip_derived=skip_derived, additional_tags=additional_tags, + use_gzip=use_gzip, ) else: # Single UID or wildcard @@ -372,6 +383,7 @@ def download_studies( force_refresh_trust_store=force_refresh_trust_store, skip_derived=skip_derived, additional_tags=additional_tags, + use_gzip=use_gzip, ) # Clean up temp directory if empty @@ -397,6 +409,7 @@ def download_studies_from_metadata( keep_sorted_dicom: bool = False, skip_derived: bool = False, additional_tags: dict[str, str] | None = None, + use_gzip: bool = False, ) -> str: """ Download DICOM studies using UIDs from metadata. @@ -479,6 +492,13 @@ def download_studies_from_metadata( ... metadata="study_metadata.tsv", ... additional_tags={"00100030": "PatientBirthDate"} ... ) + + Download with gzip compression: + >>> download_studies_from_metadata( + ... output_dir="/path/to/output", + ... metadata="study_metadata.tsv", + ... use_gzip=True + ... ) """ # Get credentials username, password = _get_credentials(username, password, credentials_file) @@ -570,6 +590,7 @@ def download_studies_from_metadata( force_refresh_trust_store=force_refresh_trust_store, skip_derived=skip_derived, additional_tags=additional_tags, + use_gzip=use_gzip, ) # Clean up temp directory if empty diff --git a/cfmm2tar/cli.py b/cfmm2tar/cli.py index cdb4df8..c9bc9c6 100644 --- a/cfmm2tar/cli.py +++ b/cfmm2tar/cli.py @@ -168,6 +168,12 @@ def main(): action="store_true", help="Skip DICOM files with ImageType containing DERIVED (e.g., reformats, derived images)", ) + parser.add_argument( + "--gzip", + dest="use_gzip", + action="store_true", + help="Create gzip-compressed tar files (.tar.gz instead of .tar)", + ) parser.add_argument( "--metadata-tags", dest="metadata_tags", @@ -378,6 +384,7 @@ def main(): skip_derived=args.skip_derived, additional_tags=additional_tags, tls_cipher=args.tls_cipher, + use_gzip=args.use_gzip, ) else: # Normal mode - use search criteria (no specific UIDs provided) @@ -399,6 +406,7 @@ def main(): skip_derived=args.skip_derived, additional_tags=additional_tags, tls_cipher=args.tls_cipher, + use_gzip=args.use_gzip, ) # Clean up temp directory if empty diff --git a/cfmm2tar/dicom_sorter.py b/cfmm2tar/dicom_sorter.py index fa79595..f41feae 100755 --- a/cfmm2tar/dicom_sorter.py +++ b/cfmm2tar/dicom_sorter.py @@ -261,7 +261,7 @@ def sort(self): return sorted_dirs - def tar(self, depth, tar_filename_sep="_"): + def tar(self, depth, tar_filename_sep="_", use_gzip=False): """ extract, apply sort rule, unwrap non-imaging dicom files, and create tar files(imaging->*.tar,non-imaging->*.attached.tar) @@ -272,6 +272,7 @@ def tar(self, depth, tar_filename_sep="_"): given depth = 4, tar filename is: project_study_date_patient_name_study_id.tar tar_filename_sep: seprator of the tar file name elements + use_gzip: if True, create gzip-compressed tar files (.tar.gz) output: tar_full_filename_list: list of resulted tar filenames @@ -320,12 +321,14 @@ def tar(self, depth, tar_filename_sep="_"): # dir_split: ['PI','Project','19700101','1970_01_01_T2','1.9AC66A0D','0003','1970_01_01_T2.MR.PI_project.0003.0194.19700101.D6C44EC8.dcm'] dir_split = relative_path_new_filename.split(os.sep) - tar_filename = tar_filename_sep.join(dir_split[:depth]) + ".tar" + tar_ext = ".tar.gz" if use_gzip else ".tar" + tar_filename = tar_filename_sep.join(dir_split[:depth]) + tar_ext tar_full_filename = os.path.join(self.output_dir, tar_filename) tar_full_filename_dict[tar_full_filename].append(item) + tar_mode = "w:gz" if use_gzip else "w" for tar_full_filename, items in tar_full_filename_dict.items(): - with tarfile.open(tar_full_filename, "w") as tar: + with tarfile.open(tar_full_filename, tar_mode) as tar: for item in items: original_full_filename = item[0] relative_path_new_filename = item[1] @@ -334,7 +337,8 @@ def tar(self, depth, tar_filename_sep="_"): tar.add(original_full_filename, arcname=arcname) # tar non-imaging: - attached_tar_full_filenames = [] + # Collect all unwrapped dirs for each attached tar file + attached_tar_dict = defaultdict(list) for item in before_after_sort_rule_list: original_full_filename = item[0] relative_path_new_filename = item[1] @@ -343,20 +347,19 @@ def tar(self, depth, tar_filename_sep="_"): if unwraped_dir: dir_split = relative_path_new_filename.split(os.sep) - attached_tar_filename = tar_filename_sep.join(dir_split[:depth]) + ".attached.tar" + attached_tar_ext = ".attached.tar.gz" if use_gzip else ".attached.tar" + attached_tar_filename = tar_filename_sep.join(dir_split[:depth]) + attached_tar_ext attached_tar_full_filename = os.path.join(self.output_dir, attached_tar_filename) - tar_arcname = relative_path_new_filename + "_unwraped" + attached_tar_dict[attached_tar_full_filename].append((unwraped_dir, tar_arcname)) - if attached_tar_full_filename not in attached_tar_full_filenames: - with tarfile.open(attached_tar_full_filename, "w") as tar: - tar.add(unwraped_dir, arcname=tar_arcname) - - attached_tar_full_filenames.append(attached_tar_full_filename) - - else: - with tarfile.open(attached_tar_full_filename, "a") as tar: - tar.add(unwraped_dir, arcname=tar_arcname) + # Write all attached tar files + attached_tar_full_filenames = [] + for attached_tar_full_filename, items in attached_tar_dict.items(): + with tarfile.open(attached_tar_full_filename, tar_mode) as tar: + for unwraped_dir, tar_arcname in items: + tar.add(unwraped_dir, arcname=tar_arcname) + attached_tar_full_filenames.append(attached_tar_full_filename) return list(tar_full_filename_dict.keys()) + attached_tar_full_filenames diff --git a/cfmm2tar/retrieve_cfmm_tar.py b/cfmm2tar/retrieve_cfmm_tar.py index f6f6efa..729c5b3 100755 --- a/cfmm2tar/retrieve_cfmm_tar.py +++ b/cfmm2tar/retrieve_cfmm_tar.py @@ -77,6 +77,7 @@ def main( skip_derived=False, additional_tags=None, tls_cipher="TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384", + use_gzip=False, ): """ main workflow: for each study: query,retrieve,tar @@ -194,7 +195,7 @@ def main( # according to CFMM's rule, folder depth is 5: # pi/project/study_date/patient/studyID_and_hash_studyInstanceUID # a list with one element, retrieved_dicom_dir contain's one study - tar_full_filenames = d.tar(5) + tar_full_filenames = d.tar(5, use_gzip=use_gzip) # if there is no dicom files in the retrieved folder, tar_full_filenames is None if not tar_full_filenames: @@ -205,7 +206,11 @@ def main( logger.info(f"tar file created: {tar_full_filename}") # .uid file - uid_full_filename = tar_full_filename[:-3] + "uid" + # Strip .tar or .tar.gz extension to add .uid + if tar_full_filename.endswith(".tar.gz"): + uid_full_filename = tar_full_filename[:-7] + ".uid" + else: + uid_full_filename = tar_full_filename[:-4] + ".uid" with open(uid_full_filename, "w") as f: f.write(StudyInstanceUID + "\n") diff --git a/tests/test_gzip.py b/tests/test_gzip.py new file mode 100644 index 0000000..9ec2fd7 --- /dev/null +++ b/tests/test_gzip.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Tests for gzip compression functionality in DicomSorter. + +These are unit tests that verify the gzip compression option works correctly. +""" + +import datetime +import os +import tarfile + +try: + import pytest + + PYTEST_AVAILABLE = True +except ImportError: + PYTEST_AVAILABLE = False + + # Create dummy decorators if pytest not available + class DummyMarker: + def __call__(self, *args, **kwargs): + if callable(args[0]): + # Being used as decorator directly + return args[0] + + # Being called with arguments, return decorator + def decorator(func): + return func + + return decorator + + def __getattr__(self, name): + return DummyMarker() + + class MockPytest: + mark = DummyMarker() + + @staticmethod + def fixture(*args, **kwargs): + def decorator(func): + return func + + return decorator + + @staticmethod + def skipif(*args, **kwargs): + def decorator(func): + return func + + return decorator + + pytest = MockPytest() + + +try: + from pydicom.dataset import Dataset, FileDataset + from pydicom.uid import ImplicitVRLittleEndian, generate_uid + + PYDICOM_AVAILABLE = True +except ImportError: + PYDICOM_AVAILABLE = False + +from cfmm2tar import dicom_sorter, sort_rules + + +def create_test_dicom(output_path, study_uid=None): + """ + Create a minimal DICOM file for testing. + + Args: + output_path: Path where to save the DICOM file + study_uid: Optional StudyInstanceUID to use (for grouping files) + + Returns: + The created dataset + """ + file_meta = Dataset() + file_meta.TransferSyntaxUID = ImplicitVRLittleEndian + file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.4" # MR Image Storage + file_meta.MediaStorageSOPInstanceUID = generate_uid() + file_meta.ImplementationClassUID = generate_uid() + + ds = FileDataset(output_path, dataset={}, file_meta=file_meta, preamble=b"\0" * 128) + + # Add required DICOM tags + ds.PatientName = "Test^Patient" + ds.PatientID = "TEST001" + ds.StudyInstanceUID = study_uid if study_uid else generate_uid() + ds.SeriesInstanceUID = generate_uid() + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + ds.StudyDate = datetime.datetime.now().strftime("%Y%m%d") + ds.StudyTime = datetime.datetime.now().strftime("%H%M%S") + ds.StudyDescription = "TestPI^TestProject" + ds.SeriesNumber = "1" + ds.InstanceNumber = "1" + ds.Modality = "MR" + ds.StudyID = "1" + ds.ProtocolName = "TestProtocol" + ds.SeriesDescription = "TestSeries" + ds.ContentDate = ds.StudyDate + ds.ImageType = ["ORIGINAL", "PRIMARY"] + + ds.SamplesPerPixel = 1 + ds.PhotometricInterpretation = "MONOCHROME2" + ds.Rows = 64 + ds.Columns = 64 + ds.BitsAllocated = 16 + ds.BitsStored = 16 + ds.HighBit = 15 + ds.PixelRepresentation = 0 + + # Create minimal pixel data (64x64 = 4096 pixels * 2 bytes = 8192 bytes) + ds.PixelData = b"\x00" * 8192 + + # Save the file + ds.save_as(output_path) + return ds + + +@pytest.mark.unit +@pytest.mark.skipif(not PYDICOM_AVAILABLE, reason="pydicom not available") +class TestGzipCompression: + """Tests for gzip compression functionality.""" + + def test_tar_without_gzip_creates_tar_files(self, temp_output_dir): + """Test that use_gzip=False creates regular .tar files.""" + input_dir = os.path.join(temp_output_dir, "input") + output_dir = os.path.join(temp_output_dir, "output") + os.makedirs(input_dir) + os.makedirs(output_dir) + + # Use same StudyInstanceUID for all files to group them into one tar + study_uid = generate_uid() + + # Create 3 DICOM files + for i in range(3): + dcm_path = os.path.join(input_dir, f"test_{i}.dcm") + create_test_dicom(dcm_path, study_uid=study_uid) + + # Create DicomSorter with use_gzip=False (default) + with dicom_sorter.DicomSorter( + input_dir, sort_rules.sort_rule_CFMM, output_dir, skip_derived=False + ) as sorter: + tar_files = sorter.tar(5, use_gzip=False) + + # Should create tar file(s) with .tar extension + assert tar_files is not None + assert len(tar_files) >= 1 + + # Check that files have .tar extension (not .tar.gz) + for tar_file in tar_files: + if not tar_file.endswith(".attached.tar"): + assert tar_file.endswith(".tar") + assert not tar_file.endswith(".tar.gz") + + # Verify files can be opened as regular tar + for tar_file in tar_files: + if tar_file.endswith(".tar"): + with tarfile.open(tar_file, "r") as tar: + members = tar.getmembers() + assert len(members) > 0 + + def test_tar_with_gzip_creates_tar_gz_files(self, temp_output_dir): + """Test that use_gzip=True creates .tar.gz files.""" + input_dir = os.path.join(temp_output_dir, "input") + output_dir = os.path.join(temp_output_dir, "output") + os.makedirs(input_dir) + os.makedirs(output_dir) + + # Use same StudyInstanceUID for all files to group them into one tar + study_uid = generate_uid() + + # Create 3 DICOM files + for i in range(3): + dcm_path = os.path.join(input_dir, f"test_{i}.dcm") + create_test_dicom(dcm_path, study_uid=study_uid) + + # Create DicomSorter with use_gzip=True + with dicom_sorter.DicomSorter( + input_dir, sort_rules.sort_rule_CFMM, output_dir, skip_derived=False + ) as sorter: + tar_files = sorter.tar(5, use_gzip=True) + + # Should create tar file(s) with .tar.gz extension + assert tar_files is not None + assert len(tar_files) >= 1 + + # Check that files have .tar.gz extension + for tar_file in tar_files: + if not tar_file.endswith(".attached.tar.gz"): + assert tar_file.endswith(".tar.gz") + + # Verify files can be opened as gzipped tar + for tar_file in tar_files: + if tar_file.endswith(".tar.gz"): + with tarfile.open(tar_file, "r:gz") as tar: + members = tar.getmembers() + assert len(members) > 0 + + def test_gzip_tar_contains_correct_files(self, temp_output_dir): + """Test that gzipped tar files contain the correct number of DICOM files.""" + input_dir = os.path.join(temp_output_dir, "input") + output_dir = os.path.join(temp_output_dir, "output") + os.makedirs(input_dir) + os.makedirs(output_dir) + + # Use same StudyInstanceUID for all files to group them + study_uid = generate_uid() + + # Create 5 DICOM files + num_files = 5 + for i in range(num_files): + dcm_path = os.path.join(input_dir, f"test_{i}.dcm") + create_test_dicom(dcm_path, study_uid=study_uid) + + # Create DicomSorter with use_gzip=True + with dicom_sorter.DicomSorter( + input_dir, sort_rules.sort_rule_CFMM, output_dir, skip_derived=False + ) as sorter: + tar_files = sorter.tar(5, use_gzip=True) + + # Count files in all tar archives + total_files = 0 + for tar_file in tar_files: + if tar_file.endswith(".tar.gz"): + with tarfile.open(tar_file, "r:gz") as tar: + total_files += len(tar.getmembers()) + + # Should have all 5 files + assert total_files == num_files + + def test_gzip_tar_files_are_smaller(self, temp_output_dir): + """Test that gzipped tar files are smaller than uncompressed ones.""" + input_dir = os.path.join(temp_output_dir, "input") + output_dir_ungz = os.path.join(temp_output_dir, "output_ungz") + output_dir_gz = os.path.join(temp_output_dir, "output_gz") + os.makedirs(input_dir) + os.makedirs(output_dir_ungz) + os.makedirs(output_dir_gz) + + # Use same StudyInstanceUID for all files + study_uid = generate_uid() + + # Create 10 DICOM files (more files = better compression ratio) + for i in range(10): + dcm_path = os.path.join(input_dir, f"test_{i}.dcm") + create_test_dicom(dcm_path, study_uid=study_uid) + + # Create uncompressed tar + with dicom_sorter.DicomSorter( + input_dir, sort_rules.sort_rule_CFMM, output_dir_ungz, skip_derived=False + ) as sorter: + tar_files_ungz = sorter.tar(5, use_gzip=False) + + # Create compressed tar + with dicom_sorter.DicomSorter( + input_dir, sort_rules.sort_rule_CFMM, output_dir_gz, skip_derived=False + ) as sorter: + tar_files_gz = sorter.tar(5, use_gzip=True) + + # Get sizes of uncompressed tar files + size_ungz = sum(os.path.getsize(f) for f in tar_files_ungz if f.endswith(".tar")) + + # Get sizes of compressed tar files + size_gz = sum(os.path.getsize(f) for f in tar_files_gz if f.endswith(".tar.gz")) + + # Gzipped files should be smaller + # Note: compression ratio may vary, but gzip should provide some compression + assert size_gz < size_ungz, ( + f"Gzipped tar ({size_gz} bytes) should be smaller than uncompressed tar ({size_ungz} bytes)" + )