Skip to content

Enable following symbolic link directories during dataset upload #246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/kagglehub/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def dataset_upload(
local_dataset_dir: str,
version_notes: str = "",
ignore_patterns: Optional[Union[list[str], str]] = None,
*,
follow_links: bool = False,
) -> None:
"""Upload dataset files.
Args:
Expand All @@ -62,6 +64,7 @@ def dataset_upload(
https://docs.python.org/3/library/fnmatch.html.
Use a pattern ending with "/" to ignore the whole dir,
e.g., ".git/" is equivalent to ".git/*".
follow_links: (bool) Enable to follow symbolic link directories.
"""
h = parse_dataset_handle(handle)
logger.info(f"Uploading Dataset {h.to_url()} ...")
Expand All @@ -73,6 +76,7 @@ def dataset_upload(
local_dataset_dir,
item_type="dataset",
ignore_patterns=normalize_patterns(default=DEFAULT_IGNORE_PATTERNS, additional=ignore_patterns),
follow_links=follow_links,
)

create_dataset_or_version(h, tokens, version_notes)
Expand Down
63 changes: 55 additions & 8 deletions src/kagglehub/gcs_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def get_size(size: float, precision: int = 0) -> str:
return f"{size:.{precision}f}{suffixes[suffix_index]}"


def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[tuple[str, list[str], list[str]]]:
def filtered_walk(
*, base_dir: str, ignore_patterns: Sequence[str], follow_links: bool = False, max_warning: int = 0
) -> Iterable[tuple[str, list[str], list[str]]]:
"""An `os.walk` like directory tree generator with filtering.

This method filters out files matching any ignore pattern.
Expand All @@ -78,12 +80,27 @@ def filtered_walk(*, base_dir: str, ignore_patterns: Sequence[str]) -> Iterable[
base_dir (str): The base dir to walk in.
ignore_patterns (Sequence[str]):
The patterns for ignored files. These are standard wildcards relative to base_dir.
follow_links (bool): If `True`, follows symbolic linked directories.
If links that point to already visited directories are detected, skip them.
max_warning (int): Maximum number of warning to be displayed.

Yields:
Iterable[tuple[str, list[str], list[str]]]: (base_dir_path, list[dir_names], list[filtered_file_names])
"""
for dir_path, dir_names, file_names in os.walk(base_dir):
visited = set()
num_warning = 0
for dir_path, dir_names, file_names in os.walk(base_dir, followlinks=follow_links):
dir_p = pathlib.Path(dir_path)

resolved_dir_p = dir_p.resolve()
if resolved_dir_p in visited:
# Avoid inclusion of circular links or duplicate directories in the dataset when follow_links=True.
if num_warning < max_warning:
logger.warning(f"Skip duplicated symlinks pointing to the same directory: {dir_p}({resolved_dir_p})")
num_warning += 1
continue
visited.add(resolved_dir_p)

filtered_files = []
for file_name in file_names:
rel_file_p = (dir_p / file_name).relative_to(base_dir)
Expand Down Expand Up @@ -188,17 +205,44 @@ def _upload_blob(file_path: str, item_type: str) -> str:
return response["token"]


def _check_uploadable_files(folder: str, *, ignore_patterns: Sequence[str], follow_links: bool) -> int:
    """Count the files that would be uploaded from `folder`.

    Walks the directory tree (honoring `ignore_patterns` and the symlink
    policy), warns when symlinked directories are skipped, and rejects an
    empty upload.

    Args:
        folder: Base directory to scan.
        ignore_patterns: Wildcard patterns, relative to `folder`, for files to skip.
        follow_links: If True, symbolic link directories are traversed by
            `filtered_walk`; if False they are skipped and a single summary
            warning is logged.

    Returns:
        The number of uploadable files found.

    Raises:
        ValueError: If `folder` is an existing directory with no uploadable files.
    """
    file_count = 0
    symlinked_dirs = set()
    for root, dirs, files in filtered_walk(
        base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links, max_warning=5
    ):
        root_path = pathlib.Path(root)
        for d in dirs:
            dir_p = root_path / d
            if dir_p.is_symlink():
                symlinked_dirs.add(dir_p)
        file_count += len(files)

    n_links = len(symlinked_dirs)
    if not follow_links and n_links > 0:
        logger.warning(
            f"Skip {n_links} symbolic link directories."
            # Fixed typo: "directores" -> "directories".
            " If you want to include these directories, set `follow_links=True`."
        )

    if file_count == 0 and os.path.isdir(folder):
        no_uploadable_exception = "No uploadable files are found. At least one file is needed."
        raise ValueError(no_uploadable_exception)

    return file_count


def upload_files_and_directories(
folder: str,
*,
ignore_patterns: Sequence[str],
item_type: str,
quiet: bool = False,
follow_links: bool = False,
) -> UploadDirectoryInfo:

# Count the total number of files
file_count = 0
for _, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns):
file_count += len(files)
file_count = _check_uploadable_files(folder, ignore_patterns=ignore_patterns, follow_links=follow_links)

if file_count > MAX_FILES_TO_UPLOAD:
if not quiet:
Expand All @@ -207,7 +251,9 @@ def upload_files_and_directories(
with TemporaryDirectory() as temp_dir:
zip_path = os.path.join(temp_dir, TEMP_ARCHIVE_FILE)
with zipfile.ZipFile(zip_path, "w") as zipf:
for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns):
for root, _, files in filtered_walk(
base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links
):
for file in files:
file_path = os.path.join(root, file)
zipf.write(file_path, os.path.relpath(file_path, folder))
Expand All @@ -226,10 +272,11 @@ def upload_files_and_directories(
if token:
root_dict.files.append(token)
else:
for root, _, files in filtered_walk(base_dir=folder, ignore_patterns=ignore_patterns):
for root, _, files in filtered_walk(
base_dir=folder, ignore_patterns=ignore_patterns, follow_links=follow_links
):
# Path of the current folder relative to the base folder
path = os.path.relpath(root, folder)

# Navigate or create the dictionary path to the current folder
current_dict = root_dict
if path != ".":
Expand Down
66 changes: 66 additions & 0 deletions tests/test_gcs_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,69 @@ def test_filtered_walk(self) -> None:
for file_name in file_names:
walked_files.append(pathlib.Path(dir_path) / file_name)
self.assertEqual(set(walked_files), expected_files)

def _setup_link_dir(self, tmp_dir_p: pathlib.Path) -> None:
"""setup the following structure
tmp_dir/
root/
link_dir -> tmp_dir/extern/
real_dir/
a.txt
extern/
loop_dir -> tmp_dir/extern/
b.txt
"""
extern_dir = tmp_dir_p / "extern"
extern_dir.mkdir()
(extern_dir / "b.txt").touch()
(extern_dir / "loop_dir").symlink_to(extern_dir, target_is_directory=True)

root_dir = tmp_dir_p / "root"
(root_dir / "real_dir").mkdir(parents=True)
(root_dir / "real_dir" / "a.txt").touch()
(root_dir / "link_dir").symlink_to(extern_dir, target_is_directory=True)

def test_follow_link_dir(self) -> None:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir_p = pathlib.Path(tmp_dir)

try:
self._setup_link_dir(tmp_dir_p)
except Exception:
self.skipTest("failed to setup linked dir")

follow_links = True
root_dir = tmp_dir_p / "root"
expected_files = {
root_dir / "real_dir" / "a.txt",
root_dir / "link_dir" / "b.txt",
}
walked_files = []
for dir_path, _, file_names in filtered_walk(
base_dir=root_dir, ignore_patterns=[], follow_links=follow_links
):
for file_name in file_names:
walked_files.append(pathlib.Path(dir_path) / file_name)
self.assertEqual(set(walked_files), expected_files)

def test_skip_link_dir(self) -> None:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir_p = pathlib.Path(tmp_dir)

try:
self._setup_link_dir(tmp_dir_p)
except Exception:
self.skipTest("failed to setup linked dir")

follow_links = False
root_dir = tmp_dir_p / "root"
expected_files = {
root_dir / "real_dir" / "a.txt",
}
walked_files = []
for dir_path, _, file_names in filtered_walk(
base_dir=root_dir, ignore_patterns=[], follow_links=follow_links
):
for file_name in file_names:
walked_files.append(pathlib.Path(dir_path) / file_name)
self.assertEqual(set(walked_files), expected_files)