Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial validate support, with --strict option #142

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ef746b1
Initial validate support, with --warnings option
will-moore Dec 2, 2021
ace592a
Add jsonschema to deps in setup.py
will-moore Dec 2, 2021
7d26157
Copy schemas from ngff. Use LocalRefResolver to load schemas
will-moore Dec 3, 2021
145b938
Add validation tests to test_cli.py
will-moore Dec 3, 2021
4888698
Add jsonschema to requirements-test.txt
will-moore Dec 3, 2021
a20c8aa
Fix jsonschema dependency
will-moore Dec 3, 2021
7520081
Remove .DS_Store files
will-moore Dec 9, 2021
cbd4ee6
Add .DS_Store to .gitignore
will-moore Dec 9, 2021
f0683a6
Use latest schema version by default
will-moore Dec 9, 2021
d093cda
Use logging instead of print
will-moore Dec 9, 2021
33298bd
Move top-level /schemas to ome_zarr/schemas
will-moore Dec 9, 2021
4c28024
remove get_strict_schema(). use get_schema(strict)
will-moore Dec 9, 2021
b388a3a
fix duplicate validate()
will-moore Dec 9, 2021
674ebb7
Import schemas from ngff
will-moore Dec 14, 2021
7b3ab42
Delete schemas
will-moore Dec 14, 2021
e380451
Merge remote-tracking branch 'origin/master' into validate_json_schema
will-moore Mar 22, 2022
d301229
Update to use 'ome_ngff' package
will-moore Mar 22, 2022
c37e116
visit(path: str, func: callable)
will-moore Mar 23, 2022
c1ed016
Remove use of ngff repo. Use cached_path instead
will-moore Jul 1, 2022
9ccebe0
Merge remote-tracking branch 'origin/master' into validate_json_schema
will-moore Jul 1, 2022
e3abc33
Remove ome_ngff dependency
will-moore Jul 1, 2022
5fef946
Add cached_path to dependencies in environment.yml
will-moore Jul 4, 2022
3ad5315
Rename --warnings to --strict. Support Plate and Well
will-moore Jul 4, 2022
5fcfe94
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 4, 2022
6774922
Update test to use --strict
will-moore Jul 4, 2022
2bbce18
Support --clear_cache argument for validate
will-moore Jul 5, 2022
96961bd
Replace spec.get_schema() with SCHEMA_NAME
will-moore Jul 5, 2022
c03b0a0
Use format.get_metadata_version(data) to get version
will-moore Jul 5, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ var
build
dist/
target/
*.DS_Store
*/.DS_Store
docs/build/
3 changes: 2 additions & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[settings]
known_third_party = dask,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr

known_third_party = cached_path,dask,jsonschema,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ channels:
- conda-forge
- defaults
dependencies:
- cached_path
- flake8
- ipython
- jsonschema
- mypy
- omero-py
- pip
Expand Down
20 changes: 20 additions & 0 deletions ome_zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .scale import Scaler
from .utils import download as zarr_download
from .utils import info as zarr_info
from .utils import validate as zarr_validate


def config_logging(loglevel: int, args: argparse.Namespace) -> None:
Expand All @@ -29,6 +30,12 @@ def info(args: argparse.Namespace) -> None:
list(zarr_info(args.path, stats=args.stats))


def validate(args: argparse.Namespace) -> None:
    """CLI entry point delegating to :func:`~ome_zarr.utils.validate`."""
    config_logging(logging.WARN, args)
    # Exhaust the generator so every node in the hierarchy gets validated;
    # the yielded nodes themselves are not needed here.
    for _ in zarr_validate(args.path, args.strict, args.clear_cache):
        pass


def download(args: argparse.Namespace) -> None:
"""Wrap the :func:`~ome_zarr.utils.download` method."""
config_logging(logging.WARN, args)
Expand Down Expand Up @@ -99,6 +106,19 @@ def main(args: List[str] = None) -> None:
parser_info.add_argument("--stats", action="store_true")
parser_info.set_defaults(func=info)

# validate
parser_validate = subparsers.add_parser("validate")
parser_validate.add_argument("path")
parser_validate.add_argument(
"--strict", action="store_true", help="validate using a strict schema"
)
parser_validate.add_argument(
"--clear_cache",
action="store_true",
help="Remove any cached schemas to force reload",
)
parser_validate.set_defaults(func=validate)

# download
parser_download = subparsers.add_parser("download")
parser_download.add_argument("path")
Expand Down
6 changes: 3 additions & 3 deletions ome_zarr/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,19 @@ def create_zarr(
"channels": [
{
"color": "FF0000",
"window": {"start": 0, "end": 255},
"window": {"start": 0, "end": 255, "min": 0, "max": 255},
"label": "Red",
"active": True,
},
{
"color": "00FF00",
"window": {"start": 0, "end": 255},
"window": {"start": 0, "end": 255, "min": 0, "max": 255},
"label": "Green",
"active": True,
},
{
"color": "0000FF",
"window": {"start": 0, "end": 255},
"window": {"start": 0, "end": 255, "min": 0, "max": 255},
"label": "Blue",
"active": True,
},
Expand Down
4 changes: 2 additions & 2 deletions ome_zarr/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def init_store(self, path: str, mode: str = "r") -> FSStore:
def init_channels(self) -> None: # pragma: no cover
raise NotImplementedError()

def _get_metadata_version(self, metadata: dict) -> Optional[str]:
def get_metadata_version(self, metadata: dict) -> Optional[str]:
"""
Checks the metadata dict for a version

Expand Down Expand Up @@ -127,7 +127,7 @@ def version(self) -> str:
return "0.1"

def matches(self, metadata: dict) -> bool:
version = self._get_metadata_version(metadata)
version = self.get_metadata_version(metadata)
LOGGER.debug(f"{self.version} matches {version}?")
return version == self.version

Expand Down
77 changes: 69 additions & 8 deletions ome_zarr/reader.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,36 @@
"""Reading logic for ome-zarr."""

import json
import logging
import math
from abc import ABC
from typing import Any, Dict, Iterator, List, Optional, Type, Union, cast, overload

import dask.array as da
import numpy as np
from cached_path import cached_path
from dask import delayed
from jsonschema import Draft202012Validator as Validator
from jsonschema import RefResolver
from jsonschema import validate as jsonschema_validate

from .axes import Axes
from .format import format_from_version
from .format import CurrentFormat, detect_format, format_from_version
from .io import ZarrLocation
from .types import JSONDict

LOGGER = logging.getLogger("ome_zarr.reader")


def get_schema(name: str, version: str, strict: bool = False) -> Dict:
    """Fetch (and locally cache) the published JSON schema for an NGFF spec.

    :param name: schema name, e.g. "image", "plate" or "well"
    :param version: OME-NGFF spec version, e.g. "0.4"
    :param strict: if True, load the "strict_" variant (adds SHOULD rules)
    :return: the parsed schema as a dict
    """
    prefix = "strict_" if strict else ""
    schema_url = (
        f"https://ngff.openmicroscopy.org/{version}/schemas/{prefix}{name}.schema"
    )
    # cached_path downloads once, then reuses the local copy on later calls
    local_path = cached_path(schema_url)
    with open(local_path) as f:
        # json.load parses the stream directly, avoiding an intermediate string
        return json.load(f)


class Node:
"""Container for a representation of the binary data somewhere in the data
hierarchy."""
Expand Down Expand Up @@ -106,6 +120,12 @@ def load(self, spec_type: Type["Spec"]) -> Optional["Spec"]:
return spec
return None

def validate(self, strict: bool) -> None:
# Validation for a node is delegated to each spec
# e.g. Labels may have spec for multiscales and labels
for spec in self.specs:
spec.validate(strict)

def add(
self,
zarr: ZarrLocation,
Expand Down Expand Up @@ -162,13 +182,19 @@ class Spec(ABC):
Multiple subclasses may apply.
"""

SCHEMA_NAME: str
version: str

    @staticmethod
    def matches(zarr: ZarrLocation) -> bool:
        # Abstract hook: each Spec subclass decides whether the metadata at
        # this location belongs to it. Subclasses must override this.
        raise NotImplementedError()

def __init__(self, node: Node) -> None:
self.node = node
self.zarr = node.zarr
fmt = detect_format(self.zarr.root_attrs, CurrentFormat())
version = fmt.get_metadata_version(self.zarr.root_attrs)
self.version = version if version is not None else fmt.version
LOGGER.debug(f"treating {self.zarr} as {self.__class__.__name__}")
for k, v in self.zarr.root_attrs.items():
LOGGER.info("root_attr: %s", k)
Expand All @@ -177,6 +203,35 @@ def __init__(self, node: Node) -> None:
def lookup(self, key: str, default: Any) -> Any:
return self.zarr.root_attrs.get(key, default)

def validate(self, strict: bool = False) -> None:
if not hasattr(self, "SCHEMA_NAME"):
LOGGER.info("No schema for %s" % self.zarr)
return
LOGGER.info("Validating Multiscales spec at: %s" % self.zarr)
schema = get_schema(self.SCHEMA_NAME, self.version)

# Always do a validation with the MUST rules
# Will throw ValidationException if it fails
json_data = self.zarr.root_attrs
jsonschema_validate(instance=json_data, schema=schema)

# If we're also checking for SHOULD rules,
# we want to iterate all errors and show as Warnings
if strict:
strict_schema = get_schema(self.SCHEMA_NAME, self.version, strict=True)
if strict_schema is None:
return
# we only need this store to allow use of cached schemas
# (and potential off-line use)
schema_store = {
schema["$id"]: schema,
strict_schema["$id"]: strict_schema,
}
resolver = RefResolver.from_schema(strict_schema, store=schema_store)
validator = Validator(strict_schema, resolver=resolver)
for error in validator.iter_errors(json_data):
LOGGER.warn(error.message)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to logging errors at a WARN level, what is the expected contract of self.validate(strict=True) if the JSON does not meet the strict schema? Should an error be thrown?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As in #142 (comment) above, I'm thinking of separating the reporting of errors from the validation itself. So, the cli validate command can log or print errors, but the Spec.validate() and utils.validate() would only return errors / messages (and maybe log at INFO level)?

It could get tricky to give the user an accurate picture of what failed: eg. when you validate an Image with Labels, the image schema is used for both, so you can get an output like this and it's hard to know which object was missing "name":

$ ome_zarr validate 6001247.zarr/ --strict --clear_cache
WARNING:ome_zarr.reader:'metadata' is a required property
WARNING:ome_zarr.reader:'type' is a required property
WARNING:ome_zarr.reader:'name' is a required property
WARNING:ome_zarr.reader:'metadata' is a required property
WARNING:ome_zarr.reader:'type' is a required property

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the spirit of jsonschema.validate vs jsonschema.iter_errors, I concur with the idea of having an API that returns an iterable of all exceptions.

Will the contract be the same for the default and the strict schemas? In the current implementation, the base schema raises an exception but the strict schema does not.
Also I assume since the strict schema extends the base schema, it is not necessary to have 2 validation calls?



class Labels(Spec):
"""Relatively small specification for the well-known "labels" group which only
Expand Down Expand Up @@ -266,6 +321,9 @@ def __init__(self, node: Node) -> None:


class Multiscales(Spec):

SCHEMA_NAME = "image"

@staticmethod
def matches(zarr: ZarrLocation) -> bool:
"""is multiscales metadata present?"""
Expand All @@ -279,12 +337,9 @@ def __init__(self, node: Node) -> None:

try:
multiscales = self.lookup("multiscales", [])
version = multiscales[0].get(
"version", "0.1"
) # should this be matched with Format.version?
datasets = multiscales[0]["datasets"]
axes = multiscales[0].get("axes")
fmt = format_from_version(version)
fmt = format_from_version(self.version)
# Raises ValueError if not valid
axes_obj = Axes(axes, fmt)
node.metadata["axes"] = axes_obj.to_list()
Expand All @@ -299,7 +354,7 @@ def __init__(self, node: Node) -> None:
return # EARLY EXIT

for resolution in self.datasets:
data: da.core.Array = self.array(resolution, version)
data: da.core.Array = self.array(resolution, self.version)
chunk_sizes = [
str(c[0]) + (" (+ %s)" % c[-1] if c[-1] != c[0] else "")
for c in data.chunks
Expand Down Expand Up @@ -395,6 +450,9 @@ def __init__(self, node: Node) -> None:


class Well(Spec):

SCHEMA_NAME = "well"

@staticmethod
def matches(zarr: ZarrLocation) -> bool:
return bool("well" in zarr.root_attrs)
Expand Down Expand Up @@ -467,22 +525,25 @@ def get_lazy_well(level: int, tile_shape: tuple) -> da.Array:


class Plate(Spec):

SCHEMA_NAME = "plate"

@staticmethod
def matches(zarr: ZarrLocation) -> bool:
return bool("plate" in zarr.root_attrs)

    def __init__(self, node: Node) -> None:
        """Read plate metadata from the node and build the lazy image pyramid."""
        super().__init__(node)
        LOGGER.debug(f"Plate created with ZarrLocation fmt:{ self.zarr.fmt}")
        # Cache the "plate" metadata dict here so other methods can reuse it.
        self.plate_data = self.lookup("plate", {})
        LOGGER.info("plate_data: %s", self.plate_data)
        self.get_pyramid_lazy(node)

def get_pyramid_lazy(self, node: Node) -> None:
"""
Return a pyramid of dask data, where the highest resolution is the
stitched full-resolution images.
"""
self.plate_data = self.lookup("plate", {})
LOGGER.info("plate_data: %s", self.plate_data)
self.rows = self.plate_data.get("rows")
self.columns = self.plate_data.get("columns")
self.first_field = "0"
Expand Down
47 changes: 38 additions & 9 deletions ome_zarr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

import json
import logging
import shutil
from pathlib import Path
from typing import Iterator, List
from typing import Callable, Iterator, List

import dask
import dask.array as da
import zarr
from cached_path import get_cache_dir
from dask.diagnostics import ProgressBar

from .io import parse_url
Expand All @@ -17,21 +19,26 @@
LOGGER = logging.getLogger("ome_zarr.utils")


def info(path: str, stats: bool = False) -> Iterator[Node]:
"""Print information about an OME-Zarr fileset.

All :class:`Nodes <ome_utils.reader.Node>` that are found from the given path will
be visited recursively.
"""
def visit(path: str, func: Callable) -> Iterator[Node]:
    """Yield func(node) for each OME-Zarr node read from *path*.

    :param path: location of the zarr hierarchy to walk
    :param func: callable applied to every node that carries at least one spec
    :raises ValueError: if *path* cannot be parsed as a zarr location
    """
    zarr = parse_url(path)
    if not zarr:
        # An assert is unsuitable for input validation: it is stripped under
        # "python -O", and the old message printed the falsy parse result
        # instead of the offending path.
        raise ValueError(f"not a zarr: {path}")
    reader = Reader(zarr)
    for node in reader():
        if not node.specs:
            print(f"not an ome-zarr node: {node}")
            continue
        yield func(node)


def info(path: str, stats: bool = False) -> Iterator[Node]:
"""Print information about an OME-Zarr fileset.

All :class:`Nodes <ome_utils.reader.Node>` that are found from the given path will
be visited recursively.
"""

def func(node: Node) -> Node:
print(node)
print(" - metadata")
for spec in node.specs:
Expand All @@ -43,7 +50,29 @@ def info(path: str, stats: bool = False) -> Iterator[Node]:
minmax = f" minmax={dask.compute(array.min(), array.max())}"
print(f" - {array.shape}{minmax}")
LOGGER.debug(node.data)
yield node
return node

return visit(path, func)


def validate(path: str, strict: bool, clear_cache: bool = False) -> Iterator[Node]:
    """
    Validate OME-NGFF data

    All :class:`Nodes <ome_utils.reader.Node>` that are found from the given path will
    be visited recursively.
    """
    if clear_cache:
        # Drop previously downloaded schemas so they are fetched afresh.
        shutil.rmtree(get_cache_dir(), ignore_errors=True)

    def _check(node: Node) -> Node:
        # Nodes expose validate(); guard defensively in case a node does not.
        checker = getattr(node, "validate", None)
        if checker is not None:
            checker(strict)
        return node

    return visit(path, _check)


def download(input_path: str, output_dir: str = ".") -> None:
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def read(fname):
install_requires += (["requests"],)
install_requires += (["scikit-image"],)
install_requires += (["toolz"],)
install_requires += (["jsonschema"],)
install_requires += (["cached_path"],)


setup(
Expand Down
9 changes: 9 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ def test_astronaut_info(self):
main(["create", "--method=astronaut", filename])
main(["info", filename])

@pytest.mark.parametrize("strict", [False, True])
def test_astronaut_validation(self, strict):
filename = str(self.path) + "-2"
main(["create", "--method=astronaut", filename])
if strict:
main(["validate", "--strict", filename])
else:
main(["validate", filename])

def test_astronaut_download(self, tmpdir):
out = str(tmpdir / "out")
filename = str(self.path) + "-3"
Expand Down