diff --git a/src/cleanvision/dataset/fsspec_dataset.py b/src/cleanvision/dataset/fsspec_dataset.py
index c732c707..942dd05b 100644
--- a/src/cleanvision/dataset/fsspec_dataset.py
+++ b/src/cleanvision/dataset/fsspec_dataset.py
@@ -19,6 +19,7 @@ def __init__(
         data_folder: Optional[str] = None,
         filepaths: Optional[List[str]] = None,
         storage_opts: Dict[str, str] = {},
+        verbose: bool = True,
     ) -> None:
         super().__init__()
         self.storage_opts = storage_opts
@@ -32,7 +33,7 @@ def __init__(
             self.fs, dataset_path = fsspec.core.url_to_fs(
                 data_folder, **self.storage_opts
             )
-            self._filepaths = self.__get_filepaths(dataset_path)
+            self._filepaths = self.__get_filepaths(dataset_path, verbose)
         else:
             assert filepaths is not None
             if len(filepaths) != len(set(filepaths)):
@@ -64,10 +65,11 @@ def get_name(self, item: Union[int, str]) -> str:
         assert isinstance(item, str)
         return item.split("/")[-1]
 
-    def __get_filepaths(self, dataset_path: str) -> List[str]:
+    def __get_filepaths(self, dataset_path: str, verbose: bool) -> List[str]:
         """See an issue here: https://github.com/fsspec/filesystem_spec/issues/1019
         There's a problem with proper patterning on /**/ in fsspec"""
-        print(f"Reading images from {dataset_path}")
+        if verbose:
+            print(f"Reading images from {dataset_path}")
         filepaths = []
         for ext in IMAGE_FILE_EXTENSIONS:
             # initial *.ext search, top level
diff --git a/src/cleanvision/dataset/utils.py b/src/cleanvision/dataset/utils.py
index 97873bdb..f5f0ee1b 100644
--- a/src/cleanvision/dataset/utils.py
+++ b/src/cleanvision/dataset/utils.py
@@ -19,11 +19,16 @@ def build_dataset(
     image_key: Optional[str] = None,
     torchvision_dataset: Optional["VisionDataset"] = None,
     storage_opts: Dict[str, str] = {},
+    verbose: bool = True,
 ) -> Dataset:
     if data_path:
-        return FSDataset(data_folder=data_path, storage_opts=storage_opts)
+        return FSDataset(
+            data_folder=data_path, storage_opts=storage_opts, verbose=verbose
+        )
     elif filepaths:
-        return FSDataset(filepaths=filepaths, storage_opts=storage_opts)
+        return FSDataset(
+            filepaths=filepaths, storage_opts=storage_opts, verbose=verbose
+        )
     elif hf_dataset and image_key:
         return HFDataset(hf_dataset, image_key)
     elif torchvision_dataset:
diff --git a/src/cleanvision/imagelab.py b/src/cleanvision/imagelab.py
index 2c98f9e1..0e9319d7 100644
--- a/src/cleanvision/imagelab.py
+++ b/src/cleanvision/imagelab.py
@@ -124,6 +124,7 @@ def __init__(
         image_key: Optional[str] = None,
         torchvision_dataset: Optional["VisionDataset"] = None,
         storage_opts: Dict[str, Any] = {},
+        verbose: bool = True,
     ) -> None:
         self._dataset = build_dataset(
             data_path,
@@ -132,6 +133,7 @@ def __init__(
             image_key,
             torchvision_dataset,
             storage_opts=storage_opts,
+            verbose=verbose,
         )
         if len(self._dataset) == 0:
             raise ValueError("No images found in the dataset specified")
@@ -276,6 +278,7 @@ def find_issues(
                 dataset=self._dataset,
                 imagelab_info=self.info,
                 n_jobs=n_jobs,
+                verbose=verbose,
             )
 
             # update issues, issue_summary and info
diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py b/src/cleanvision/issue_managers/duplicate_issue_manager.py
index 2e7858f3..81ea3811 100644
--- a/src/cleanvision/issue_managers/duplicate_issue_manager.py
+++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py
@@ -107,6 +107,7 @@ def find_issues(
         dataset: Optional[Dataset] = None,
         imagelab_info: Optional[Dict[str, Any]] = None,
         n_jobs: Optional[int] = None,
+        verbose: Optional[bool] = None,
         **kwargs: Any,
     ) -> None:
         super().find_issues(**kwargs)
@@ -125,7 +126,9 @@ def find_issues(
 
         results: List[Dict[str, Union[str, int]]] = []
         if n_jobs == 1:
-            for idx in tqdm(dataset.index):
+            for idx in tqdm(
+                dataset.index, leave=verbose, desc="Computing hashes", smoothing=0
+            ):
                 results.append(compute_hash(idx, dataset, to_compute, self.params))
         else:
             args = [
@@ -145,6 +148,9 @@ def find_issues(
                         compute_hash_wrapper, args, chunksize=chunksize
                     ),
                     total=len(dataset),
+                    leave=verbose,
+                    desc="Computing hashes",
+                    smoothing=0,
                 )
             )
 
diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py
index facc92ab..a25efaf5 100644
--- a/src/cleanvision/issue_managers/image_property_issue_manager.py
+++ b/src/cleanvision/issue_managers/image_property_issue_manager.py
@@ -114,6 +114,7 @@ def find_issues(
         dataset: Optional[Dataset] = None,
         imagelab_info: Optional[Dict[str, Any]] = None,
         n_jobs: Optional[int] = None,
+        verbose: Optional[bool] = None,
         **kwargs: Any,
     ) -> None:
         super().find_issues(**kwargs)
@@ -138,7 +139,9 @@ def find_issues(
         if to_be_computed:
             results: List[Dict[str, Union[int, float, str]]] = []
             if n_jobs == 1:
-                for idx in tqdm(dataset.index):
+                for idx in tqdm(
+                    dataset.index, leave=verbose, desc="Computing scores", smoothing=0
+                ):
                     results.append(
                         compute_scores(
                             idx, dataset, to_be_computed, self.image_properties
@@ -162,6 +165,9 @@ def find_issues(
                             compute_scores_wrapper, args, chunksize=chunksize
                         ),
                         total=len(dataset),
+                        leave=verbose,
+                        desc="Computing scores",
+                        smoothing=0,
                     )
                 )
 
diff --git a/src/cleanvision/utils/base_issue_manager.py b/src/cleanvision/utils/base_issue_manager.py
index 67a01c1e..b3977f08 100644
--- a/src/cleanvision/utils/base_issue_manager.py
+++ b/src/cleanvision/utils/base_issue_manager.py
@@ -32,6 +32,7 @@ def check_params(**kwargs: Any) -> None:
             "dataset": Dataset,
             "imagelab_info": Dict[str, Any],
             "n_jobs": int,
+            "verbose": bool,
         }
 
         for name, value in kwargs.items():
diff --git a/src/cleanvision/utils/utils.py b/src/cleanvision/utils/utils.py
index efe9c2aa..2a4f8f5e 100644
--- a/src/cleanvision/utils/utils.py
+++ b/src/cleanvision/utils/utils.py
@@ -51,6 +51,7 @@ def get_filepaths(
     """
 
     abs_dir_path = os.path.abspath(os.path.expanduser(dir_path))
+    # ToDo: Suppress print according to verbosity level
     print(f"Reading images from {abs_dir_path}")
     filepaths = []
     for ext in IMAGE_FILE_EXTENSIONS:
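
Usage sketch: a minimal example of the new flag, assuming the public Imagelab entry points touched in this patch (the data path below is a placeholder):

    from cleanvision import Imagelab

    # verbose=False at construction suppresses the "Reading images from ..." print
    # emitted while FSDataset enumerates image files.
    imagelab = Imagelab(data_path="path/to/images", verbose=False)

    # find_issues forwards verbose to the issue managers, which pass it to tqdm's
    # `leave` argument so the "Computing hashes" / "Computing scores" bars are
    # cleared from the console once they finish.
    imagelab.find_issues(verbose=False)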