Skip to content

Commit 22c9beb

Browse files
authored
Use lsjson for searches (#551)
* Add get_rclone_config_name_local and rename get_rclone_config_name to get_rclone_config_name_central. * Add new transfer function. * Fix rename in tests. * Tidy up the search functions. * Fix circular import. * Adding more tests and fixing an edge case. * Add wildcards to local filesystem transfer tests. * Added tests to ssh. * Remove unused function. * Move teardown to fix tests on macos. * Revert get_rclone_config_name_central name change. * Add documentation to tests. * Remove unnecessary sorted. * Fix cyclical import. * Remove unnecessary wildcard. * Remove unused functions.
1 parent 9d680f3 commit 22c9beb

File tree

8 files changed

+331
-196
lines changed

8 files changed

+331
-196
lines changed

datashuttle/configs/config_class.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def get_base_folder(
209209
def get_rclone_config_name(
210210
self, connection_method: Optional[str] = None
211211
) -> str:
212-
"""Generate the rclone configuration name for the project.
212+
"""Generate the rclone configuration name for the central project.
213213
214214
These configs are created by datashuttle but managed and stored by rclone.
215215
"""

datashuttle/utils/folders.py

Lines changed: 72 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
from datashuttle.configs.config_class import Configs
1717
from datashuttle.utils.custom_types import TopLevelFolder
1818

19-
import glob
19+
import fnmatch
20+
import json
2021
from pathlib import Path
2122

2223
from datashuttle.configs import canonical_folders, canonical_tags
23-
from datashuttle.utils import ssh, utils, validation
24+
from datashuttle.utils import rclone, utils, validation
2425
from datashuttle.utils.custom_exceptions import NeuroBlueprintError
2526

2627
# -----------------------------------------------------------------------------
@@ -598,67 +599,90 @@ def search_for_folders(
598599
Discovered folders (`all_folder_names`) and files (`all_filenames`).
599600
600601
"""
601-
if local_or_central == "central" and cfg["connection_method"] == "ssh":
602-
all_folder_names, all_filenames = ssh.search_ssh_central_for_folders(
603-
search_path,
604-
search_prefix,
605-
cfg,
606-
verbose,
607-
return_full_path,
608-
)
602+
if (
603+
local_or_central == "local"
604+
or cfg["connection_method"] == "local_filesystem"
605+
) and not search_path.exists():
606+
if verbose:
607+
utils.log_and_message(f"No file found at {search_path.as_posix()}")
608+
return [], []
609+
610+
if local_or_central == "local":
611+
rclone_config_name = None
609612
else:
610-
if not search_path.exists():
611-
if verbose:
612-
utils.log_and_message(
613-
f"No file found at {search_path.as_posix()}"
614-
)
615-
return [], []
616-
617-
all_folder_names, all_filenames = search_filesystem_path_for_folders(
618-
search_path / search_prefix, return_full_path
613+
rclone_config_name = cfg.get_rclone_config_name(
614+
cfg["connection_method"]
619615
)
616+
617+
all_folder_names, all_filenames = search_local_or_remote(
618+
search_path,
619+
search_prefix,
620+
rclone_config_name,
621+
return_full_path,
622+
)
623+
620624
return all_folder_names, all_filenames
621625

622626

623-
# Actual function implementation
624-
def search_filesystem_path_for_folders(
625-
search_path_with_prefix: Path, return_full_path: bool = False
626-
) -> Tuple[List[Path | str], List[Path | str]]:
627-
r"""Search a folder through the local filesystem.
627+
def search_local_or_remote(
628+
search_path: Path,
629+
search_prefix: str,
630+
rclone_config_name: str | None,
631+
return_full_path: bool = False,
632+
) -> Tuple[List[Any], List[Any]]:
633+
"""Search for files and folders in a local or central path using the `rclone lsjson` command.
628634
629-
Use glob to search the full search path (including prefix) with glob.
630-
Files are filtered out of results, returning folders only.
635+
This command lists all the files and folders in the search path in JSON format.
636+
The JSON output contains information about each file/folder, such as its name and type.
631637
632638
Parameters
633639
----------
634-
search_path_with_prefix
635-
Path to search along with search prefix e.g. "C:\drive\project\sub-*"
636-
640+
search_path
641+
The path to search (relative to the local or remote drive). For example,
642+
for "local_filesystem" this is the path on the local machine. For "ssh", this
643+
is the path on the machine that has been connected to.
644+
search_prefix
645+
The search string e.g. "sub-*".
646+
rclone_config_name
647+
Name of the rclone config for the remote (not set for local). `rclone config`
648+
can be used in the terminal to see how rclone has stored these. In datashuttle,
649+
these are managed by `Configs`.
637650
return_full_path
638-
If `True` returns the path to the discovered folder or file,
639-
otherwise just the name.
640-
641-
Returns
642-
-------
643-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
651+
If `True`, return the full filepath, otherwise return only the folder/file name.
644652
645653
"""
646-
all_folder_names = []
647-
all_filenames = []
654+
config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:"
648655

649-
all_files_and_folders = list(glob.glob(search_path_with_prefix.as_posix()))
650-
sorter_files_and_folders = sorted(all_files_and_folders)
656+
output = rclone.call_rclone(
657+
f'lsjson {config_prefix}"{search_path.as_posix()}"',
658+
pipe_std=True,
659+
)
651660

652-
for file_or_folder_str in sorter_files_and_folders:
653-
file_or_folder = Path(file_or_folder_str)
661+
all_folder_names: List[str] = []
662+
all_filenames: List[str] = []
654663

655-
if file_or_folder.is_dir():
656-
all_folder_names.append(
657-
file_or_folder if return_full_path else file_or_folder.name
658-
)
664+
if output.returncode != 0:
665+
utils.log_and_message(
666+
f"Error searching files at {search_path.as_posix()}\n"
667+
f"{output.stderr.decode('utf-8') if output.stderr else ''}"
668+
)
669+
return all_folder_names, all_filenames
670+
671+
files_and_folders = json.loads(output.stdout)
672+
673+
for file_or_folder in files_and_folders:
674+
name = file_or_folder["Name"]
675+
676+
if not fnmatch.fnmatch(name, search_prefix):
677+
continue
678+
679+
is_dir = file_or_folder.get("IsDir", False)
680+
681+
to_append = search_path / name if return_full_path else name
682+
683+
if is_dir:
684+
all_folder_names.append(to_append)
659685
else:
660-
all_filenames.append(
661-
file_or_folder if return_full_path else file_or_folder.name
662-
)
686+
all_filenames.append(to_append)
663687

664688
return all_folder_names, all_filenames

datashuttle/utils/formatting.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,13 @@ def check_and_format_names(
6464
names_to_format, reserved_keywords = [], []
6565
for name in names:
6666
if name in canonical_reserved_keywords() or tags("*") in name:
67-
reserved_keywords.append(name)
67+
if tags("to") in name:
68+
# handle an edge case where the user searches with both tags
69+
reserved_keywords += update_names_with_range_to_flag(
70+
[name], prefix
71+
)
72+
else:
73+
reserved_keywords.append(name)
6874
else:
6975
names_to_format.append(name)
7076

datashuttle/utils/getters.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,9 +293,9 @@ def get_existing_project_paths() -> List[Path]:
293293
"""
294294
datashuttle_path = canonical_folders.get_datashuttle_path()
295295

296-
all_folders, _ = folders.search_filesystem_path_for_folders(
297-
datashuttle_path / "*"
298-
)
296+
all_folders = [
297+
path_ for path_ in datashuttle_path.glob("*") if path_.is_dir()
298+
]
299299

300300
existing_project_paths = []
301301
for folder_name in all_folders:

datashuttle/utils/rclone.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,22 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
if TYPE_CHECKING:
6+
from pathlib import Path
7+
8+
from datashuttle.configs.config_class import Configs
9+
from datashuttle.utils.custom_types import TopLevelFolder
10+
111
import os
212
import platform
313
import subprocess
414
import tempfile
5-
from pathlib import Path
615
from subprocess import CompletedProcess
716
from typing import Dict, List, Literal
817

918
from datashuttle.configs import canonical_configs
10-
from datashuttle.configs.config_class import Configs
1119
from datashuttle.utils import utils
12-
from datashuttle.utils.custom_types import TopLevelFolder
1320

1421

1522
def call_rclone(command: str, pipe_std: bool = False) -> CompletedProcess:

datashuttle/utils/ssh.py

Lines changed: 1 addition & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,10 @@
55
if TYPE_CHECKING:
66
from datashuttle.configs.config_class import Configs
77

8-
import fnmatch
98
import getpass
10-
import stat
119
import sys
1210
from pathlib import Path
13-
from typing import Any, List, Optional, Tuple
11+
from typing import Optional
1412

1513
import paramiko
1614

@@ -322,119 +320,3 @@ def verify_ssh_central_host(
322320
utils.log("Host not accepted. No connection made.")
323321

324322
return success
325-
326-
327-
# -----------------------------------------------------------------------------
328-
# Search over SSH
329-
# -----------------------------------------------------------------------------
330-
331-
332-
def search_ssh_central_for_folders(
333-
search_path: Path,
334-
search_prefix: str,
335-
cfg: Configs,
336-
verbose: bool = True,
337-
return_full_path: bool = False,
338-
) -> Tuple[List[Any], List[Any]]:
339-
"""Search for the search prefix in the search path over SSH.
340-
341-
Parameters
342-
----------
343-
search_path
344-
Path to search for folders in.
345-
346-
search_prefix
347-
Search prefix for folder names e.g. "sub-*".
348-
349-
cfg
350-
See connect_client_with_logging().
351-
352-
verbose
353-
If `True`, if a search folder cannot be found, a message
354-
will be printed with the un-found path.
355-
356-
return_full_path
357-
include the search_path in the returned paths
358-
359-
Returns
360-
-------
361-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
362-
363-
"""
364-
client: paramiko.SSHClient
365-
with paramiko.SSHClient() as client:
366-
connect_client_with_logging(
367-
client, cfg, message_on_sucessful_connection=verbose
368-
)
369-
370-
sftp = client.open_sftp()
371-
372-
all_folder_names, all_filenames = get_list_of_folder_names_over_sftp(
373-
sftp,
374-
search_path,
375-
search_prefix,
376-
verbose,
377-
return_full_path,
378-
)
379-
380-
return all_folder_names, all_filenames
381-
382-
383-
def get_list_of_folder_names_over_sftp(
384-
sftp: paramiko.sftp_client.SFTPClient,
385-
search_path: Path,
386-
search_prefix: str,
387-
verbose: bool = True,
388-
return_full_path: bool = False,
389-
) -> Tuple[List[Any], List[Any]]:
390-
"""Use paramiko's sftp to search a path over ssh for folders.
391-
392-
Return the folder names.
393-
394-
Parameters
395-
----------
396-
sftp
397-
Connected paramiko stfp object
398-
(see search_ssh_central_for_folders()).
399-
400-
search_path
401-
Path to search for folders in.
402-
403-
search_prefix
404-
Prefix (can include wildcards)
405-
to search folder names.
406-
407-
verbose
408-
If `True`, if a search folder cannot be found, a message
409-
will be printed with the un-found path.
410-
411-
return_full_path
412-
include the search_path in the returned paths.
413-
414-
Returns
415-
-------
416-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
417-
418-
"""
419-
all_folder_names = []
420-
all_filenames = []
421-
try:
422-
for file_or_folder in sftp.listdir_attr(search_path.as_posix()):
423-
if file_or_folder.st_mode is not None and fnmatch.fnmatch(
424-
file_or_folder.filename, search_prefix
425-
):
426-
to_append = (
427-
search_path / file_or_folder.filename
428-
if return_full_path
429-
else file_or_folder.filename
430-
)
431-
if stat.S_ISDIR(file_or_folder.st_mode):
432-
all_folder_names.append(to_append)
433-
else:
434-
all_filenames.append(to_append)
435-
436-
except FileNotFoundError:
437-
if verbose:
438-
utils.log_and_message(f"No file found at {search_path.as_posix()}")
439-
440-
return all_folder_names, all_filenames

0 commit comments

Comments
 (0)