Skip to content

Commit

Permalink
scandeps should not apply mergedirs at every level (#1615)
Browse files Browse the repository at this point in the history
* scandeps should not apply mergedirs at every level

When getting back a list of dependencies while using "nestdirs", in the
specific case where there are references to both a directory and files
within that directory, scandeps may report multiple dependencies for the
same directory.  The mergedirs function merges those references into one.

* Add comments to scandeps

* Add test.  Fix mypy/format
  • Loading branch information
tetron authored Feb 10, 2022
1 parent 7cef4dd commit 5f93354
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 24 deletions.
5 changes: 4 additions & 1 deletion cwltool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
CWL_IANA,
Process,
add_sizes,
mergedirs,
scandeps,
shortname,
use_custom_schema,
Expand Down Expand Up @@ -620,7 +621,9 @@ def loadref(base: str, uri: str) -> Union[CommentedMap, CommentedSeq, str, None]
nestdirs=nestdirs,
)
if sfs is not None:
deps["secondaryFiles"] = cast(MutableSequence[CWLOutputAtomType], sfs)
deps["secondaryFiles"] = cast(
MutableSequence[CWLOutputAtomType], mergedirs(sfs)
)

return deps

Expand Down
65 changes: 42 additions & 23 deletions cwltool/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,42 +1104,38 @@ def nestdir(base: str, deps: CWLObjectType) -> CWLObjectType:
sp = s2.split("/")
sp.pop()
while sp:
loc = dirname + "/".join(sp)
nx = sp.pop()
deps = {"class": "Directory", "basename": nx, "listing": [deps]}
deps = {
"class": "Directory",
"basename": nx,
"listing": [deps],
"location": loc,
}
return deps


def mergedirs(listing: List[CWLObjectType]) -> List[CWLObjectType]:
def mergedirs(
listing: MutableSequence[CWLObjectType],
) -> MutableSequence[CWLObjectType]:
r = [] # type: List[CWLObjectType]
ents = {} # type: Dict[str, CWLObjectType]
collided = set() # type: Set[str]
for e in listing:
basename = cast(str, e["basename"])
if basename not in ents:
ents[basename] = e
elif e["location"] != ents[basename]["location"]:
raise ValidationException(
"Conflicting basename in listing or secondaryFiles, '%s' used by both '%s' and '%s'"
% (basename, e["location"], ents[basename]["location"])
)
elif e["class"] == "Directory":
if e.get("listing"):
# name already in entries
# merge it into the existing listing
cast(
List[CWLObjectType], ents[basename].setdefault("listing", [])
).extend(cast(List[CWLObjectType], e["listing"]))
if cast(str, ents[basename]["location"]).startswith("_:"):
ents[basename]["location"] = e["location"]
elif e["location"] != ents[basename]["location"]:
# same basename, different location, collision,
# rename both.
collided.add(basename)
e2 = ents[basename]

e["basename"] = urllib.parse.quote(cast(str, e["location"]), safe="")
e2["basename"] = urllib.parse.quote(cast(str, e2["location"]), safe="")

e["nameroot"], e["nameext"] = os.path.splitext(cast(str, e["basename"]))
e2["nameroot"], e2["nameext"] = os.path.splitext(cast(str, e2["basename"]))

ents[cast(str, e["basename"])] = e
ents[cast(str, e2["basename"])] = e2
for c in collided:
del ents[c]
for e in ents.values():
if e["class"] == "Directory" and "listing" in e:
e["listing"] = cast(
Expand All @@ -1162,6 +1158,30 @@ def scandeps(
urljoin: Callable[[str, str], str] = urllib.parse.urljoin,
nestdirs: bool = True,
) -> MutableSequence[CWLObjectType]:

"""Given a CWL document or input object, search for dependencies
(references to external files) of 'doc' and return them as a list
of File or Directory objects.
The 'base' is the base URL for relative references.
Looks for objects with 'class: File' or 'class: Directory' and
adds them to the list of dependencies.
Anything in 'urlfields' is also added as a File dependency.
Anything in 'reffields' (such as workflow step 'run') will be
added as a dependency and also loaded (using the 'loadref'
function) and recursively scanned for dependencies. Those
dependencies will be added as secondary files to the primary file.
If "nestdirs" is true, create intermediate directory objects when
a file is located in a subdirectory under the starting directory.
This is so that if the dependencies are materialized, they will
produce the same relative file system locations.
"""

r: MutableSequence[CWLObjectType] = []
if isinstance(doc, MutableMapping):
if "id" in doc:
Expand Down Expand Up @@ -1268,7 +1288,7 @@ def scandeps(
)
if sf:
deps2["secondaryFiles"] = cast(
MutableSequence[CWLOutputAtomType], sf
MutableSequence[CWLOutputAtomType], mergedirs(sf)
)
if nestdirs:
deps2 = nestdir(base, deps2)
Expand Down Expand Up @@ -1313,7 +1333,6 @@ def scandeps(

if r:
normalizeFilesDirs(r)
r = mergedirs(cast(List[CWLObjectType], r))

return r

Expand Down
47 changes: 47 additions & 0 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,53 @@ def loadref(
assert scanned_deps2 == expected_deps


def test_scandeps_samedirname() -> None:
    """Check that scandeps reports two same-named directories separately.

    The input object references two Directory objects that share the
    basename ``foo`` but live at different locations.  With
    ``nestdirs=False`` both are expected to be returned as distinct
    dependencies.
    """
    obj: CWLObjectType = {
        "dir1": {"class": "Directory", "location": "tests/wf/dir1/foo"},
        "dir2": {"class": "Directory", "location": "tests/wf/dir2/foo"},
    }

    def loadref(
        base: str, p: Union[CommentedMap, CommentedSeq, str, None]
    ) -> Union[CommentedMap, CommentedSeq, str, None]:
        """Return already-loaded mappings unchanged; refuse to load anything else."""
        if isinstance(p, dict):
            return p
        raise Exception("test case can't load things")

    scanned_deps = cast(
        List[Dict[str, Any]],
        cwltool.process.scandeps(
            "",
            obj,
            {"$import", "run"},
            {"$include", "$schemas", "location"},
            loadref,
            nestdirs=False,
        ),
    )

    # Both entries share the basename "foo", so sorting on "basename" would
    # be a no-op and the comparison below would depend on the order in which
    # scandeps happens to return them.  Sort on the unique "location" instead.
    scanned_deps.sort(key=lambda k: cast(str, k["location"]))

    expected_deps = [
        {"basename": "foo", "class": "Directory", "location": "tests/wf/dir1/foo"},
        {"basename": "foo", "class": "Directory", "location": "tests/wf/dir2/foo"},
    ]

    assert scanned_deps == expected_deps


def test_scandeps_collision() -> None:
    """Check that --print-deps fails on tests/wf/dir_deps.json.

    The run is expected to finish with exit status 1.
    """
    captured = StringIO()
    argv = ["--print-deps", "--debug", get_data("tests/wf/dir_deps.json")]

    exit_code = main(argv, stdout=captured)

    assert exit_code == 1


def test_trick_scandeps() -> None:
stream = StringIO()

Expand Down
7 changes: 7 additions & 0 deletions tests/wf/dir_deps.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"dir1": {
"class": "Directory",
"listing": [
{"class": "File", "basename": "foo", "location": "tests/wf/foo1"},
{"class": "File", "basename": "foo", "location": "tests/wf/foo2"}
]
}

0 comments on commit 5f93354

Please sign in to comment.