From 3851997fcd5fea99179d686b86c378e5f33250e8 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 18 Feb 2025 09:35:52 +0100 Subject: [PATCH 1/9] Update to zarrs 0.20 --- Cargo.toml | 7 ++-- pyproject.toml | 2 +- src/lib.rs | 109 +++++++++++++++++++++++-------------------------- 3 files changed, 57 insertions(+), 61 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c7b26ad..a832f9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,9 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["abi3-py311"] } -zarrs = { version = "0.19.0", features = ["async"] } +zarrs = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63", features = [ + "async", +] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path @@ -21,8 +23,7 @@ serde_json = "1.0.128" pyo3-stub-gen = "0.6.2" opendal = { version = "0.51.0", features = ["services-http"] } tokio = { version = "1.41.1", features = ["rt-multi-thread"] } -zarrs_opendal = "0.5.0" -zarrs_metadata = "0.3.3" # require recent zarr-python compatibility fixes (remove with zarrs 0.20) +zarrs_opendal = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63" } [profile.release] lto = true diff --git a/pyproject.toml b/pyproject.toml index 91916c6..0e5cf08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,7 +99,7 @@ select = [ "W", # Warning detected by Pycodestyle "UP", # pyupgrade "I", # isort - "TCH", # manage type checking blocks + "TC", # manage type checking blocks "TID251", # Banned imports "ICN", # Follow import conventions "PTH", # Pathlib instead of os.path diff --git a/src/lib.rs b/src/lib.rs index cae087c..c01759a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,8 @@ use rayon_iter_concurrent_limit::iter_concurrent_limit; use unsafe_cell_slice::UnsafeCellSlice; use zarrs::array::codec::{ArrayToBytesCodecTraits, CodecOptions, CodecOptionsBuilder}; use zarrs::array::{ - copy_fill_value_into, update_array_bytes, ArrayBytes, ArraySize, CodecChain, FillValue, + copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, + CodecChain, FillValue, }; use zarrs::array_subset::ArraySubset; use zarrs::metadata::v3::MetadataV3; @@ -107,7 +108,7 @@ impl CodecPipelineImpl { codec_options: &CodecOptions, ) -> PyResult<()> { let array_shape = item.representation().shape_u64(); - if !chunk_subset.inbounds(&array_shape) { + if !chunk_subset.inbounds_shape(&array_shape) { return Err(PyErr::new::(format!( "chunk subset ({chunk_subset}) is out of bounds for array shape ({array_shape:?})" ))); @@ -127,20 +128,14 @@ impl CodecPipelineImpl { let chunk_bytes_old = self.retrieve_chunk_bytes(item, codec_chain, codec_options)?; // Update the chunk - let chunk_bytes_new = unsafe { - // SAFETY: - // - chunk_bytes_old is compatible with the chunk shape and data type size (validated on decoding) - // - chunk_subset is compatible with chunk_subset_bytes and the data type size (validated above) - // - chunk_subset is within the bounds of the chunk shape (validated above) - // - output bytes and output subset bytes are compatible (same data type) - update_array_bytes( - chunk_bytes_old, - &array_shape, - chunk_subset, - &chunk_subset_bytes, - data_type_size, - ) - }; + let chunk_bytes_new = update_array_bytes( + chunk_bytes_old, + &array_shape, + chunk_subset, + &chunk_subset_bytes, + data_type_size, + ) + .map_py_err::()?; // Store the updated chunk self.store_chunk_bytes(item, codec_chain, chunk_bytes_new, codec_options) @@ -270,42 +265,50 @@ impl CodecPipelineImpl { // For variable length data types, need a codepath with non `_into` methods. // Collect all the subsets and copy into value on the Python side? let update_chunk_subset = |item: chunk_item::WithSubset| { + let chunk_item::WithSubset { + item, + subset, + chunk_subset, + } = item; + let mut output_view = unsafe { + // TODO: Is the following correct? + // can we guarantee that when this function is called from Python with arbitrary arguments? + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new( + output, + // TODO: why is data_type in `item`, it should be derived from `output`, no? + item.representation() + .data_type() + .fixed_size() + .ok_or("variable length data type not supported") + .map_py_err::()?, + &output_shape, + subset, + ) + .map_py_err::()? + }; + // See zarrs::array::Array::retrieve_chunk_subset_into - if item.chunk_subset.start().iter().all(|&o| o == 0) - && item.chunk_subset.shape() == item.representation().shape_u64() + if chunk_subset.start().iter().all(|&o| o == 0) + && chunk_subset.shape() == item.representation().shape_u64() { // See zarrs::array::Array::retrieve_chunk_into if let Some(chunk_encoded) = self.stores.get(&item)? { // Decode the encoded data into the output buffer let chunk_encoded: Vec = chunk_encoded.into(); - unsafe { - // SAFETY: - // - output is an array with output_shape elements of the item.representation data type, - // - item.subset is within the bounds of output_shape. - self.codec_chain.decode_into( - Cow::Owned(chunk_encoded), - item.representation(), - &output, - &output_shape, - &item.subset, - &codec_options, - ) - } + self.codec_chain.decode_into( + Cow::Owned(chunk_encoded), + item.representation(), + &mut output_view, + &codec_options, + ) } else { // The chunk is missing, write the fill value - unsafe { - // SAFETY: - // - data type and fill value are confirmed to be compatible when the ChunkRepresentation is created, - // - output is an array with output_shape elements of the item.representation data type, - // - item.subset is within the bounds of output_shape. - copy_fill_value_into( - item.representation().data_type(), - item.representation().fill_value(), - &output, - &output_shape, - &item.subset, - ) - } + copy_fill_value_into( + item.representation().data_type(), + item.representation().fill_value(), + &mut output_view, + ) } } else { let input_handle = Arc::new(self.stores.decoder(&item)?); @@ -314,19 +317,11 @@ impl CodecPipelineImpl { .clone() .partial_decoder(input_handle, item.representation(), &codec_options) .map_py_err::()?; - unsafe { - // SAFETY: - // - output is an array with output_shape elements of the item.representation data type, - // - item.subset is within the bounds of output_shape. - // - item.chunk_subset has the same number of elements as item.subset. - partial_decoder.partial_decode_into( - &item.chunk_subset, - &output, - &output_shape, - &item.subset, - &codec_options, - ) - } + partial_decoder.partial_decode_into( + &chunk_subset, + &mut output_view, + &codec_options, + ) } .map_py_err::() }; From eb5989f036c5ebb2327fa4e4e55bcbd0ff2ae9d3 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:00:40 +1000 Subject: [PATCH 2/9] chore(deps): bump `zarrs` to 0.20.0-beta.1 --- Cargo.toml | 7 ++----- src/chunk_item.rs | 10 ++++++---- src/metadata_v2.rs | 7 ++++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index faa996f..a68fdde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,9 +10,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["abi3-py311"] } -zarrs = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63", features = [ - "async", -] } +zarrs = { version = "0.20.0-beta.1", features = ["async"] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path @@ -23,8 +21,7 @@ serde_json = "1.0.128" pyo3-stub-gen = "0.7.0" opendal = { version = "0.51.0", features = ["services-http"] } tokio = { version = "1.41.1", features = ["rt-multi-thread"] } -zarrs_opendal = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63" } -zarrs_metadata = "0.3.7" # require recent zarr-python compatibility fixes (remove with zarrs 0.20) +zarrs_opendal = "0.7.1" itertools = "0.9.0" [profile.release] diff --git a/src/chunk_item.rs b/src/chunk_item.rs index 7bdea4b..feac5cb 100644 --- a/src/chunk_item.rs +++ b/src/chunk_item.rs @@ -10,7 +10,7 @@ use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods}; use zarrs::{ array::{ChunkRepresentation, DataType, FillValue}, array_subset::ArraySubset, - metadata::v3::{array::data_type::DataTypeMetadataV3, MetadataV3}, + metadata::v3::MetadataV3, storage::StoreKey, }; @@ -146,9 +146,11 @@ fn get_chunk_representation( fill_value: Vec, ) -> PyResult { // Get the chunk representation - let data_type = - DataType::from_metadata(&DataTypeMetadataV3::from_metadata(&MetadataV3::new(dtype))) - .map_py_err::()?; + let data_type = DataType::from_metadata( + &MetadataV3::new(dtype), + zarrs::config::global_config().data_type_aliases_v3(), + ) + .map_py_err::()?; let chunk_shape = chunk_shape .into_iter() .map(|x| NonZeroU64::new(x).expect("chunk shapes should always be non-zero")) diff --git a/src/metadata_v2.rs b/src/metadata_v2.rs index de14123..a5b9cb7 100644 --- a/src/metadata_v2.rs +++ b/src/metadata_v2.rs @@ -1,7 +1,6 @@ use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult}; use zarrs::metadata::{ - v2::{array::ArrayMetadataV2Order, MetadataV2}, - v3::array::data_type::DataTypeMetadataV3, + v2::{array::ArrayMetadataV2Order, MetadataV2}, v3::MetadataV3, }; #[pyfunction] @@ -38,10 +37,12 @@ pub fn codec_metadata_v2_to_v3( let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3( ArrayMetadataV2Order::C, 0, // unused with C order - &DataTypeMetadataV3::Bool, // FIXME + &MetadataV3::new("bool"), // FIXME None, &filters, &compressor, + zarrs::config::global_config().codec_aliases_v2(), + zarrs::config::global_config().codec_aliases_v3(), ) .map_err(|err| { // TODO: More informative error messages from zarrs for ArrayMetadataV2ToV3ConversionError From 33e0b261ce58ee62535e0b7f929d863ece50287b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:00:54 +1000 Subject: [PATCH 3/9] feat: enable `zlib`/`pcodec`/`bz2` codecs in `zarrs` --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a68fdde..609c5ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["abi3-py311"] } -zarrs = { version = "0.20.0-beta.1", features = ["async"] } +zarrs = { version = "0.20.0-beta.1", features = ["async", "zlib", "pcodec", "bz2"] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path From 75f3c37e73bbdd72949e9c66254e3e6c9e2f850b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:03:04 +1000 Subject: [PATCH 4/9] fmt --- src/metadata_v2.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/metadata_v2.rs b/src/metadata_v2.rs index a5b9cb7..adceb73 100644 --- a/src/metadata_v2.rs +++ b/src/metadata_v2.rs @@ -1,6 +1,7 @@ use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult}; use zarrs::metadata::{ - v2::{array::ArrayMetadataV2Order, MetadataV2}, v3::MetadataV3, + v2::{array::ArrayMetadataV2Order, MetadataV2}, + v3::MetadataV3, }; #[pyfunction] @@ -36,7 +37,7 @@ pub fn codec_metadata_v2_to_v3( // However, CodecPipeline.from_codecs does not supply this information, and CodecPipeline.evolve_from_array_spec is seemingly never called. let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3( ArrayMetadataV2Order::C, - 0, // unused with C order + 0, // unused with C order &MetadataV3::new("bool"), // FIXME None, &filters, From 374e5638750f77f9cea598a6bafdb374d9ff56a7 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:09:26 +1000 Subject: [PATCH 5/9] chore(deps): bump `opendal` to 0.53 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 609c5ce..a0b300b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ numpy = "0.23.0" unsafe_cell_slice = "0.2.0" serde_json = "1.0.128" pyo3-stub-gen = "0.7.0" -opendal = { version = "0.51.0", features = ["services-http"] } +opendal = { version = "0.53.0", features = ["services-http"] } tokio = { version = "1.41.1", features = ["rt-multi-thread"] } zarrs_opendal = "0.7.1" itertools = "0.9.0" From 6ccf60259b0c64de8656217172a01849faca856b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:27:40 +1000 Subject: [PATCH 6/9] chore: bump ruff in pre-commit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32e342d..b927d42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,7 +21,7 @@ repos: language: system pass_filenames: false - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.2 + rev: v0.11.9 hooks: - id: ruff args: ["--fix"] From 23164be4fbcdf2865d424f8477fc7835cbd5eb0b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 12 May 2025 21:56:19 +1000 Subject: [PATCH 7/9] fix: revive the partial decoder cache accidentally removed in 48d06b9 --- src/lib.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6615cd0..c750f78 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -274,8 +274,8 @@ impl CodecPipelineImpl { .unique_by(|item| item.key()) .collect::>(); let mut partial_decoder_cache: HashMap> = - HashMap::new().into(); - if partial_chunk_descriptions.len() > 0 { + HashMap::new(); + if !partial_chunk_descriptions.is_empty() { let key_decoder_pairs = iter_concurrent_limit!( chunk_concurrent_limit, partial_chunk_descriptions, @@ -349,12 +349,10 @@ impl CodecPipelineImpl { ) } } else { - let input_handle = Arc::new(self.stores.decoder(&item)?); - let partial_decoder = self - .codec_chain - .clone() - .partial_decoder(input_handle, item.representation(), &codec_options) - .map_py_err::()?; + let key = item.key(); + let partial_decoder = partial_decoder_cache.get(key).ok_or_else(|| { + PyRuntimeError::new_err(format!("Partial decoder not found for key: {key}")) + })?; partial_decoder.partial_decode_into( &chunk_subset, &mut output_view, From 9ad993cea22fef1bbe8d8c6b271b0662a778e340 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Fri, 16 May 2025 17:51:51 +1000 Subject: [PATCH 8/9] chore(deps): bump `zarrs` to 0.20.0-beta.2 --- Cargo.toml | 2 +- src/metadata_v2.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a0b300b..514ee8e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["abi3-py311"] } -zarrs = { version = "0.20.0-beta.1", features = ["async", "zlib", "pcodec", "bz2"] } +zarrs = { version = "0.20.0-beta.2", features = ["async", "zlib", "pcodec", "bz2"] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path diff --git a/src/metadata_v2.rs b/src/metadata_v2.rs index adceb73..7714e57 100644 --- a/src/metadata_v2.rs +++ b/src/metadata_v2.rs @@ -1,6 +1,6 @@ use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult}; use zarrs::metadata::{ - v2::{array::ArrayMetadataV2Order, MetadataV2}, + v2::{ArrayMetadataV2Order, MetadataV2}, v3::MetadataV3, }; @@ -35,7 +35,7 @@ pub fn codec_metadata_v2_to_v3( // FIXME: The array order, dimensionality, data type, and endianness are needed to exhaustively support all Zarr V2 data that zarrs can handle. // However, CodecPipeline.from_codecs does not supply this information, and CodecPipeline.evolve_from_array_spec is seemingly never called. - let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3( + let metadata = zarrs::metadata_ext::v2_to_v3::codec_metadata_v2_to_v3( ArrayMetadataV2Order::C, 0, // unused with C order &MetadataV3::new("bool"), // FIXME From 85f5ca91cfa656df02730d2bca86a985682554f1 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 19 May 2025 08:10:02 +1000 Subject: [PATCH 9/9] chore(deps): bump `zarrs` to 0.20.0 --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 514ee8e..b61cb02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.23.2", features = ["abi3-py311"] } -zarrs = { version = "0.20.0-beta.2", features = ["async", "zlib", "pcodec", "bz2"] } +zarrs = { version = "0.20.0", features = ["async", "zlib", "pcodec", "bz2"] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path @@ -21,7 +21,7 @@ serde_json = "1.0.128" pyo3-stub-gen = "0.7.0" opendal = { version = "0.53.0", features = ["services-http"] } tokio = { version = "1.41.1", features = ["rt-multi-thread"] } -zarrs_opendal = "0.7.1" +zarrs_opendal = "0.7.2" itertools = "0.9.0" [profile.release]