diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 30a4fbf..9f560f7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,5 +46,5 @@ ends in `.bz2`, the function will compress first with bz2. In `tests.rs`, integration tests for reading and writing all file types are present. Small example files are contained in `tests/test_files`. -### `benches/io_benchmarking.rs` -This file contains benchmarking functions for checking the performance of the basic read functions. \ No newline at end of file +### `benches/bench.rs` +This file contains benchmarking functions for checking the performance of the basic read functions. diff --git a/Cargo.toml b/Cargo.toml index 567bed5..025a7e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,14 +1,15 @@ [package] name = "darn-dmap" -version = "0.5.0" +version = "0.6.0" edition = "2021" rust-version = "1.63.0" authors = ["Remington Rohel"] description = "SuperDARN DMAP file format I/O" repository = "https://github.com/SuperDARNCanada/dmap" license = "LGPL-3.0-or-later" -keywords = ["SuperDARN", "dmap", "I/O"] +keywords = ["SuperDARN"] categories = ["parser-implementations", "science"] +include = ["src/**/*.rs"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -34,5 +35,5 @@ paste = "1.0.15" criterion = { version = "0.4", features = ["html_reports"] } [[bench]] -name = "io_benchmarking" +name = "bench" harness = false diff --git a/README.md b/README.md index 5d41e5e..b88df50 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ A library for SuperDARN DMAP file I/O ===================================== -[![github]](https://github.com/SuperDARNCanada/dmap) [![crates-io]](https://crates.io/crates/darn-dmap) [![docs-rs]](crate) +[![github]](https://github.com/SuperDARNCanada/dmap) [![crates-io]](https://crates.io/crates/darn-dmap) [![docs-rs]](https://docs.rs/darn-dmap) [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github [crates-io]: 
https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 0000000..a235042 --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,79 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use dmap::formats::dmap::DmapRecord; +use dmap::formats::fitacf::FitacfRecord; +use dmap::formats::grid::GridRecord; +use dmap::formats::iqdat::IqdatRecord; +use dmap::formats::map::MapRecord; +use dmap::formats::rawacf::RawacfRecord; +use dmap::formats::snd::SndRecord; +use dmap::record::Record; +use dmap::types::DmapField; +use indexmap::IndexMap; +use paste::paste; + +fn criterion_benchmark(c: &mut Criterion) { + c.bench_function("Read IQDAT", |b| b.iter(|| read_iqdat())); + c.bench_function("Read RAWACF", |b| b.iter(|| read_rawacf())); + c.bench_function("Read FITACF", |b| b.iter(|| read_fitacf())); + c.bench_function("Read GRID", |b| b.iter(|| read_grid())); + c.bench_function("Read SND", |b| b.iter(|| read_snd())); + c.bench_function("Read MAP", |b| b.iter(|| read_map())); + c.bench_function("Read DMAP", |b| b.iter(|| read_dmap())); + + c.bench_function("Read bzipped IQDAT", |b| b.iter(|| read_iqdat_bz2())); + c.bench_function("Read bzipped RAWACF", |b| b.iter(|| read_rawacf_bz2())); + c.bench_function("Read bzipped FITACF", |b| b.iter(|| read_fitacf_bz2())); + c.bench_function("Read bzipped GRID", |b| b.iter(|| read_grid_bz2())); + c.bench_function("Read bzipped SND", |b| b.iter(|| read_snd_bz2())); + c.bench_function("Read bzipped MAP", |b| b.iter(|| read_map_bz2())); + c.bench_function("Read bzipped DMAP", |b| b.iter(|| read_dmap_bz2())); + + c.bench_function("Read IQDAT metadata", |b| b.iter(|| read_iqdat_metadata())); + c.bench_function("Read RAWACF metadata", |b| b.iter(|| read_rawacf_metadata())); + c.bench_function("Read FITACF metadata", |b| b.iter(|| read_fitacf_metadata())); + c.bench_function("Read GRID metadata", |b| b.iter(|| 
read_grid_metadata())); +    c.bench_function("Read SND metadata", |b| b.iter(|| read_snd_metadata())); +    c.bench_function("Read MAP metadata", |b| b.iter(|| read_map_metadata())); +    c.bench_function("Read DMAP metadata", |b| b.iter(|| read_dmap_metadata())); + +    // let records = read_iqdat(); +    // c.bench_with_input( +    //     BenchmarkId::new("Write IQDAT", "IQDAT Records"), +    //     &records, +    //     |b, s| b.iter(|| write_iqdat(s)), +    // ); +} + +/// Generates benchmark functions for a given DMAP record type. +macro_rules! read_type { +    ($type:ident, $name:literal) => { +        paste! { +            fn [< read_ $type >]() -> Vec<[< $type:camel Record >]> { +                [< $type:camel Record >]::read_file(format!("tests/test_files/test.{}", $name)).unwrap() +            } + +            fn [< read_ $type _bz2 >]() -> Vec<[< $type:camel Record >]> { +                [< $type:camel Record >]::read_file(format!("tests/test_files/test.{}.bz2", $name)).unwrap() +            } + +            fn [< read_ $type _metadata >]() -> Vec<IndexMap<String, DmapField>> { +                [< $type:camel Record >]::read_file_metadata(format!("tests/test_files/test.{}", $name)).unwrap() +            } +        } +    } +} + +read_type!(iqdat, "iqdat"); +read_type!(rawacf, "rawacf"); +read_type!(fitacf, "fitacf"); +read_type!(grid, "grid"); +read_type!(map, "map"); +read_type!(snd, "snd"); +read_type!(dmap, "rawacf"); + +criterion_group!
{ + name = benches; + config = Criterion::default(); + targets = criterion_benchmark +} +criterion_main!(benches); diff --git a/benches/io_benchmarking.rs b/benches/io_benchmarking.rs deleted file mode 100644 index 9216e4d..0000000 --- a/benches/io_benchmarking.rs +++ /dev/null @@ -1,58 +0,0 @@ -use criterion::{criterion_group, criterion_main, Criterion}; -use dmap::formats::fitacf::FitacfRecord; -use dmap::formats::grid::GridRecord; -use dmap::formats::iqdat::IqdatRecord; -use dmap::formats::map::MapRecord; -use dmap::formats::rawacf::RawacfRecord; -use dmap::formats::snd::SndRecord; -use dmap::record::Record; -use paste::paste; -use std::fs::File; - -fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("Read IQDAT", |b| b.iter(|| read_iqdat())); - c.bench_function("Read RAWACF", |b| b.iter(|| read_rawacf())); - c.bench_function("Read FITACF", |b| b.iter(|| read_fitacf())); - c.bench_function("Read GRID", |b| b.iter(|| read_grid())); - c.bench_function("Read SND", |b| b.iter(|| read_snd())); - c.bench_function("Read MAP", |b| b.iter(|| read_map())); - - // let records = read_iqdat(); - // c.bench_with_input( - // BenchmarkId::new("Write IQDAT", "IQDAT Records"), - // &records, - // |b, s| b.iter(|| write_iqdat(s)), - // ); -} - -/// Generates benchmark functions for a given DMAP record type. -macro_rules! read_type { - ($type:ident) => { - paste! 
{ - fn [< read_ $type >]() -> Vec<[< $type:camel Record >]> { - let file = File::open(format!("tests/test_files/test.{}", stringify!($type))).expect("Test file not found"); - [< $type:camel Record >]::read_records(file).unwrap() - } - } - } -} - -read_type!(iqdat); -read_type!(rawacf); -read_type!(fitacf); -read_type!(grid); -read_type!(map); -read_type!(snd); - -// fn write_iqdat(records: &Vec) { -// let file = File::open("tests/test_files/test.iqdat").expect("Test file not found"); -// dmap::read_records(file).unwrap(); -// dmap::to_file("tests/test_files/temp.iqdat", records).unwrap(); -// } - -criterion_group! { - name = benches; - config = Criterion::default(); - targets = criterion_benchmark -} -criterion_main!(benches); diff --git a/pyproject.toml b/pyproject.toml index 9f7f204..c96dc9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "darn-dmap" -version = "0.5.0" +version = "0.6.0" requires-python = ">=3.8" authors = [ { name = "Remington Rohel" } @@ -34,4 +34,4 @@ strip = true dev = [ "pytest", "ruff", -] \ No newline at end of file +] diff --git a/python/dmap/_wrapper.py b/python/dmap/_wrapper.py index a4a7754..d62ac8b 100644 --- a/python/dmap/_wrapper.py +++ b/python/dmap/_wrapper.py @@ -1,7 +1,7 @@ """ Wrappers around the `dmap_rs` Python API. -Each file type will have one function for calling any type of reading (strict, lax, bytes, sniff) or any type of writing +Each file type will have one function for calling any type of reading (strict, lax, bytes, sniff, metadata) or any type of writing (regular, bytes). """ @@ -23,14 +23,14 @@ def read_dispatcher( fmt: str DMAP format being read. One of `["dmap", "iqdat", "rawacf", "fitacf", "grid", "map", "snd"]`. mode: str - Mode in which to read the data, either `strict`, `lax`, or `sniff`. In `strict` mode, any corruption + Mode in which to read the data, one of `["strict", "lax", "sniff", "metadata"]`. 
In `strict` mode, any corruption in the data will raise an error. In `lax` mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In `sniff` mode, `source` must be a `str`, and - only the first record will be read. + only the first record will be read. In `metadata` mode, only the metadata fields of the records are returned. Returns ------- - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, and the second is the byte index where `source` was no longer a valid record of type `fmt`. If `mode` is `sniff`, returns `dict` of the first record. @@ -40,10 +40,10 @@ def read_dispatcher( f"invalid fmt `{fmt}`: expected one of ['dmap', 'iqdat', 'rawacf', 'fitacf', 'grid', 'map', 'snd']" ) - if mode not in ["strict", "lax", "sniff"]: + if mode not in ["strict", "lax", "sniff", "metadata"]: raise ValueError(f"invalid mode `{mode}`: expected `strict`, `lax`, or `sniff`") - if mode == "sniff" and not isinstance(source, str): + if mode in ["sniff", "metadata"] and not isinstance(source, str): raise TypeError( f"invalid type for `source` {type(source)} in `sniff` mode: expected `str`" ) @@ -66,6 +66,7 @@ def read_dispatcher( f"_{fmt}" f"{'_bytes' if isinstance(source, bytes) else ''}" f"{'_lax' if mode == 'lax' else ''}" + f"{'_metadata' if mode == 'metadata' else ''}" ) return getattr(dmap_rs, fn_name)(source) @@ -114,17 +115,19 @@ def read_dmap( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". 
+ Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and since this function is for generic DMAP records where there + is no information about which fields are metadata, the behaviour mirrors that of "strict" mode. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "dmap", mode) @@ -133,7 +136,7 @@ def read_dmap( def read_iqdat( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in IQDAT data from `source`. Parameters @@ -142,17 +145,18 @@ def read_iqdat( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. 
In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "iqdat", mode) @@ -161,7 +165,7 @@ def read_iqdat( def read_rawacf( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in RAWACF data from `source`. Parameters @@ -170,17 +174,18 @@ def read_rawacf( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. 
Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "rawacf", mode) @@ -189,7 +194,7 @@ def read_rawacf( def read_fitacf( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in FITACF data from `source`. Parameters @@ -198,17 +203,18 @@ def read_fitacf( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. 
+ If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "fitacf", mode) @@ -217,7 +223,7 @@ def read_fitacf( def read_grid( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in GRID data from `source`. Parameters @@ -226,17 +232,18 @@ def read_grid( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "grid", mode) @@ -245,7 +252,7 @@ def read_grid( def read_map( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in MAP data from `source`. 
Parameters @@ -254,17 +261,18 @@ def read_map( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "map", mode) @@ -273,7 +281,7 @@ def read_map( def read_snd( source: Union[str, bytes], mode: str = "lax" ) -> Union[dict, list[dict], tuple[list[dict], Optional[int]]]: - """ + """ Reads in SND data from `source`. Parameters @@ -282,17 +290,18 @@ def read_snd( Where to read data from. If input is of type `str`, this is interpreted as the path to a file. If input is of type `bytes`, this is interpreted as the raw data itself. mode: str - Mode in which to read the data, either "lax" (default), "strict", or "sniff". + Mode in which to read the data, either "lax" (default), "strict", "sniff", or "metadata". 
In "lax" mode, all valid records will be returned in a tuple along with the byte index of `source` where the corruption starts. In "strict" mode, any corruption in the data will raise an error. In "sniff" mode, `source` must be a path, and only the first record will be read. + In "metadata" mode, `source` must be a path, and only the metadata fields of the records are returned. Returns ------- If `mode` is `lax`, returns `tuple[list[dict], Optional[int]]`, where the first element is the records which were parsed, - and the second is the byte index where `source` was no longer a valid record of type `fmt`. - If `mode` is `strict`, returns `list[dict]` which is the parsed records. + and the second is the byte index where `source` was no longer a valid DMAP record. + If `mode` is `strict` or `metadata`, returns `list[dict]` which is the parsed records. If `mode` is `sniff`, returns `dict`, which is the first record. """ return read_dispatcher(source, "snd", mode) diff --git a/src/error.rs b/src/error.rs index a49b973..8aaa80d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -11,11 +11,11 @@ pub enum DmapError { CorruptStream(&'static str), /// Unable to read from a buffer. - #[error("{0}")] + #[error(transparent)] Io(#[from] std::io::Error), /// Error casting between Dmap types. - #[error("{0}")] + #[error(transparent)] BadCast(#[from] std::num::TryFromIntError), /// Invalid key for a DMAP type. 
Valid keys are defined [here](https://github.com/SuperDARN/rst/blob/main/codebase/general/src.lib/dmap.1.25/include/dmap.h) diff --git a/src/formats/dmap.rs b/src/formats/dmap.rs index c5b1f1f..1a439b6 100644 --- a/src/formats/dmap.rs +++ b/src/formats/dmap.rs @@ -29,6 +29,9 @@ impl Record<'_> for DmapRecord {             data: fields.to_owned(),         })     } +    fn is_metadata_field(_name: &str) -> bool { +        true +    }     fn to_bytes(&self) -> Result<Vec<u8>, DmapError> {         let mut data_bytes: Vec<u8> = vec![];         let mut num_scalars: i32 = 0; diff --git a/src/formats/fitacf.rs b/src/formats/fitacf.rs index 65110ea..9a398e5 100644 --- a/src/formats/fitacf.rs +++ b/src/formats/fitacf.rs @@ -154,6 +154,48 @@ static MATCHED_VECS: [[&str; 39]; 1] = [[     "x_sd_phi", ]]; +static DATA_FIELDS: [&str; 39] = [ +    "slist", +    "nlag", +    "qflg", +    "gflg", +    "p_l", +    "p_l_e", +    "p_s", +    "p_s_e", +    "v", +    "v_e", +    "w_l", +    "w_l_e", +    "w_s", +    "w_s_e", +    "sd_l", +    "sd_s", +    "sd_phi", +    "x_qflg", +    "x_gflg", +    "x_p_l", +    "x_p_l_e", +    "x_p_s", +    "x_p_s_e", +    "x_v", +    "x_v_e", +    "x_w_l", +    "x_w_l_e", +    "x_w_s", +    "x_w_s_e", +    "phi0", +    "phi0_e", +    "elv", +    "elv_fitted", +    "elv_error", +    "elv_low", +    "elv_high", +    "x_sd_l", +    "x_sd_s", +    "x_sd_phi", +]; + lazy_static! {     static ref FITACF_FIELDS: Fields<'static> = Fields {         all_fields: { @@ -174,7 +216,8 @@ lazy_static! {                 grouped_vecs.push(group.to_vec())             }             grouped_vecs -        } +        }, +        data_fields: DATA_FIELDS.to_vec(),     }; } diff --git a/src/formats/grid.rs b/src/formats/grid.rs index 4f7792f..f0d2f41 100644 --- a/src/formats/grid.rs +++ b/src/formats/grid.rs @@ -58,6 +58,22 @@ static VECTOR_FIELDS_OPT: [(&str, Type); 13] = [     ("vector.srng", Type::Float), ]; +static DATA_FIELDS: [&str; 13] = [ +    "vector.mlat", +    "vector.mlon", +    "vector.kvect", +    "vector.stid", +    "vector.channel", +    "vector.index", +    "vector.vel.median", +    "vector.vel.sd", +    "vector.pwr.median", +    "vector.pwr.sd", +    "vector.wdt.median", +    "vector.wdt.sd", +    "vector.srng", +]; + lazy_static!
{ static ref MATCHED_VECS: Vec> = vec![ vec![ @@ -109,6 +125,7 @@ lazy_static! { vectors_required: VECTOR_FIELDS.to_vec(), vectors_optional: VECTOR_FIELDS_OPT.to_vec(), vector_dim_groups: MATCHED_VECS.clone(), + data_fields: DATA_FIELDS.to_vec(), }; } diff --git a/src/formats/iqdat.rs b/src/formats/iqdat.rs index 78dcfa9..24cbf15 100644 --- a/src/formats/iqdat.rs +++ b/src/formats/iqdat.rs @@ -76,6 +76,8 @@ static MATCHED_VECS: [[&str; 6]; 1] = [["tsc", "tus", "tatten", "tnoise", "toff" static VECTOR_FIELDS_OPT: [(&str, Type); 0] = []; +static DATA_FIELDS: [&str; 1] = ["data"]; + lazy_static! { static ref IQDAT_FIELDS: Fields<'static> = Fields { all_fields: { @@ -91,6 +93,7 @@ lazy_static! { vectors_required: VECTOR_FIELDS.to_vec(), vectors_optional: VECTOR_FIELDS_OPT.to_vec(), vector_dim_groups: MATCHED_VECS.to_vec().iter().map(|x| x.to_vec()).collect(), + data_fields: DATA_FIELDS.to_vec(), }; } diff --git a/src/formats/map.rs b/src/formats/map.rs index 128d93c..4992281 100644 --- a/src/formats/map.rs +++ b/src/formats/map.rs @@ -105,6 +105,22 @@ static VECTOR_FIELDS_OPT: [(&str, Type); 23] = [ ("boundary.mlon", Type::Float), ]; +static DATA_FIELDS: [&str; 13] = [ + "vector.mlat", + "vector.mlon", + "vector.kvect", + "vector.stid", + "vector.channel", + "vector.index", + "vector.srng", + "vector.vel.median", + "vector.vel.sd", + "vector.pwr.median", + "vector.pwr.sd", + "vector.wdt.median", + "vector.wdt.sd", +]; + lazy_static! { static ref MATCHED_VECS: Vec> = vec![ vec![ @@ -164,6 +180,7 @@ lazy_static! 
{ vectors_required: VECTOR_FIELDS.to_vec(), vectors_optional: VECTOR_FIELDS_OPT.to_vec(), vector_dim_groups: MATCHED_VECS.clone(), + data_fields: DATA_FIELDS.to_vec(), }; } diff --git a/src/formats/rawacf.rs b/src/formats/rawacf.rs index 2d98f9c..713b54d 100644 --- a/src/formats/rawacf.rs +++ b/src/formats/rawacf.rs @@ -66,6 +66,8 @@ static VECTOR_FIELDS: [(&str, Type); 5] = [ static VECTOR_FIELDS_OPT: [(&str, Type); 1] = [("xcfd", Type::Float)]; +static DATA_FIELDS: [&str; 4] = ["pwr0", "slist", "acfd", "xcfd"]; + lazy_static! { static ref RAWACF_FIELDS: Fields<'static> = Fields { all_fields: { @@ -81,6 +83,7 @@ lazy_static! { vectors_required: VECTOR_FIELDS.to_vec(), vectors_optional: VECTOR_FIELDS_OPT.to_vec(), vector_dim_groups: vec![], + data_fields: DATA_FIELDS.to_vec(), }; } diff --git a/src/formats/snd.rs b/src/formats/snd.rs index 1ec1c32..d2d7a7d 100644 --- a/src/formats/snd.rs +++ b/src/formats/snd.rs @@ -65,6 +65,10 @@ static MATCHED_VECS: [[&str; 10]; 1] = [[ "slist", "qflg", "gflg", "v", "v_e", "p_l", "w_l", "x_qflg", "phi0", "phi0_e", ]]; +static DATA_FIELDS: [&str; 10] = [ + "slist", "qflg", "gflg", "v", "v_e", "p_l", "w_l", "x_qflg", "phi0", "phi0_e", +]; + lazy_static! { static ref SND_FIELDS: Fields<'static> = Fields { all_fields: { @@ -80,6 +84,7 @@ lazy_static! { vectors_required: VECTOR_FIELDS.to_vec(), vectors_optional: VECTOR_FIELDS_OPT.to_vec(), vector_dim_groups: MATCHED_VECS.to_vec().iter().map(|x| x.to_vec()).collect(), + data_fields: DATA_FIELDS.to_vec(), }; } diff --git a/src/lib.rs b/src/lib.rs index 128998a..fb70a27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,10 +139,24 @@ write_rust!(dmap); /// Creates functions for reading DMAP files for the Python API. 
/// -/// Generates two functions: `read_[type]` and `read_[type]_lax`, for strict and lax +/// Generates six functions: +/// * `read_[name]` - reads a file, raising an error on a corrupted file +/// * `read_[name]_lax` - reads a file, returning the records and the byte where corruption starts, if corrupted. +/// * `read_[name]_bytes` - reads from bytes, similar to `read_[name]` +/// * `read_[name]_bytes_lax` - reads from bytes, similar to `read_[name]_lax` +/// * `sniff_[name]` - reads only the first record from file. +/// * `read_[name]_metadata` - reads only the metadata from records in a file. /// reading, respectively. macro_rules! read_py { -    ($name:ident, $py_name:literal, $lax_name:literal, $bytes_name:literal, $lax_bytes_name:literal, $sniff_name:literal) => { +    ( +        $name:ident, +        $py_name:literal, +        $lax_name:literal, +        $bytes_name:literal, +        $lax_bytes_name:literal, +        $sniff_name:literal, +        $metadata_name:literal +    ) => {         paste! {             #[doc = "Reads a `" $name:upper "` file, returning a list of dictionaries containing the fields." ]             #[pyfunction] @@ -209,6 +223,16 @@ macro_rules! read_py {                     .inner()                 )             } + +            #[doc = "Reads a `" $name:upper "` file, returning a list of dictionaries containing only the metadata fields." ] +            #[pyfunction] +            #[pyo3(name = $metadata_name)] +            #[pyo3(text_signature = "(infile: str, /)")] +            fn [< read_ $name _metadata_py >](infile: PathBuf) -> PyResult<Vec<IndexMap<String, DmapField>>> { +                Ok([< $name:camel Record >]::read_file_metadata(&infile) +                    .map_err(PyErr::from)?
+ ) + } } } } @@ -219,7 +243,8 @@ read_py!( "read_iqdat_lax", "read_iqdat_bytes", "read_iqdat_bytes_lax", - "sniff_iqdat" + "sniff_iqdat", + "read_iqdat_metadata" ); read_py!( rawacf, @@ -227,7 +252,8 @@ read_py!( "read_rawacf_lax", "read_rawacf_bytes", "read_rawacf_bytes_lax", - "sniff_rawacf" + "sniff_rawacf", + "read_rawacf_metadata" ); read_py!( fitacf, @@ -235,7 +261,8 @@ read_py!( "read_fitacf_lax", "read_fitacf_bytes", "read_fitacf_bytes_lax", - "sniff_fitacf" + "sniff_fitacf", + "read_fitacf_metadata" ); read_py!( grid, @@ -243,7 +270,8 @@ read_py!( "read_grid_lax", "read_grid_bytes", "read_grid_bytes_lax", - "sniff_grid" + "sniff_grid", + "read_grid_metadata" ); read_py!( map, @@ -251,7 +279,8 @@ read_py!( "read_map_lax", "read_map_bytes", "read_map_bytes_lax", - "sniff_map" + "sniff_map", + "read_map_metadata" ); read_py!( snd, @@ -259,7 +288,8 @@ read_py!( "read_snd_lax", "read_snd_bytes", "read_snd_bytes_lax", - "sniff_snd" + "sniff_snd", + "read_snd_metadata" ); read_py!( dmap, @@ -267,7 +297,8 @@ read_py!( "read_dmap_lax", "read_dmap_bytes", "read_dmap_bytes_lax", - "sniff_dmap" + "sniff_dmap", + "read_dmap_metadata" ); /// Checks that a list of dictionaries contains DMAP records, then appends to outfile. 
@@ -395,5 +426,14 @@ fn dmap_rs(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(sniff_grid_py, m)?)?; m.add_function(wrap_pyfunction!(sniff_map_py, m)?)?; + // Read only the metadata from files + m.add_function(wrap_pyfunction!(read_dmap_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_iqdat_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_rawacf_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_fitacf_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_snd_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_grid_metadata_py, m)?)?; + m.add_function(wrap_pyfunction!(read_map_metadata_py, m)?)?; + Ok(()) } diff --git a/src/record.rs b/src/record.rs index 3896f75..5ee25b9 100644 --- a/src/record.rs +++ b/src/record.rs @@ -3,7 +3,10 @@ use crate::compression::detect_bz2; use crate::error::DmapError; use crate::io; -use crate::types::{parse_scalar, parse_vector, read_data, DmapField, DmapType, DmapVec, Fields}; +use crate::types::{ + parse_scalar, parse_vector, parse_vector_header, read_data, DmapField, DmapType, DmapVec, + Fields, +}; use bzip2::read::BzDecoder; use indexmap::IndexMap; use itertools::izip; @@ -35,6 +38,9 @@ pub trait Record<'a>: /// Returns the names of all fields stored in the record. fn keys(&self) -> Vec<&String>; + /// Returns whether `name` is a metadata field of the record. + fn is_metadata_field(name: &str) -> bool; + /// Reads from `dmap_data` and parses into `Vec`. /// /// Returns `DmapError` if `dmap_data` cannot be read or contains invalid data. @@ -144,6 +150,81 @@ pub trait Record<'a>: Ok(dmap_records) } + /// Reads metadata of records from `dmap_data` and parses into `Vec`. + /// + /// Returns `DmapError` if `dmap_data` cannot be read or contains invalid data. 
+ fn read_metadata( + mut dmap_data: impl Read, + ) -> Result>, DmapError> + where + Self: Sized, + Self: Send, + { + let mut buffer: Vec = vec![]; + let (is_bz2, mut chunk) = detect_bz2(&mut dmap_data)?; + if is_bz2 { + let mut stream = BzDecoder::new(chunk); + stream.read_to_end(&mut buffer)?; + } else { + chunk.read_to_end(&mut buffer)?; + } + + let mut slices: Vec<_> = vec![]; + let mut rec_start: usize = 0; + let mut rec_size: usize; + let mut rec_end: usize; + + while ((rec_start + 2 * i32::size()) as u64) < buffer.len() as u64 { + rec_size = i32::from_le_bytes(buffer[rec_start + 4..rec_start + 8].try_into().unwrap()) + as usize; // advance 4 bytes, skipping the "code" field + rec_end = rec_start + rec_size; // error-checking the size is conducted in Self::parse_record() + if rec_end > buffer.len() { + return Err(DmapError::InvalidRecord(format!("Record {} starting at byte {} has size greater than remaining length of buffer ({} > {})", slices.len(), rec_start, rec_size, buffer.len() - rec_start))); + } else if rec_size == 0 { + return Err(DmapError::InvalidRecord(format!( + "Record {} starting at byte {} has non-positive size {} <= 0", + slices.len(), + rec_start, + rec_size + ))); + } + slices.push(Cursor::new(buffer[rec_start..rec_end].to_vec())); + rec_start = rec_end; + } + if rec_start != buffer.len() { + return Err(DmapError::InvalidRecord(format!( + "Record {} starting at byte {} incomplete; has size of {} bytes", + slices.len() + 1, + rec_start, + buffer.len() - rec_start + ))); + } + let mut dmap_results: Vec, DmapError>> = vec![]; + dmap_results.par_extend( + slices + .par_iter_mut() + .map(|cursor| Self::parse_metadata(cursor)), + ); + + let mut dmap_records: Vec> = vec![]; + let mut bad_recs: Vec = vec![]; + let mut dmap_errors: Vec = vec![]; + for (i, rec) in dmap_results.into_iter().enumerate() { + match rec { + Ok(x) => dmap_records.push(x), + Err(e) => { + dmap_errors.push(e); + bad_recs.push(i); + } + } + } + if !dmap_errors.is_empty() { + 
return Err(DmapError::BadRecords(bad_recs, dmap_errors[0].to_string())); + } + + Ok(dmap_records) + } + /// Reads from `dmap_data` and parses into `Vec`. /// /// Returns a 2-tuple, where the first entry is the good records from the front of the buffer, @@ -239,6 +320,108 @@ pub trait Record<'a>: Self::read_first_record(file) } + /// Read the metadata from a DMAP file of type `Self` + fn read_file_metadata>( + infile: P, + ) -> Result>, DmapError> + where + Self: Sized, + Self: Send, + { + let file = File::open(infile)?; + Self::read_metadata(file) + } + + /// Reads a record from `cursor`, only keeping the metadata fields. + fn parse_metadata( + cursor: &mut Cursor>, + ) -> Result, DmapError> + where + Self: Sized, + { + let bytes_already_read = cursor.position(); + let _code = read_data::(cursor).map_err(|e| { + DmapError::InvalidRecord(format!( + "Cannot interpret code at byte {}: {e}", + bytes_already_read + )) + })?; + let size = read_data::(cursor).map_err(|e| { + DmapError::InvalidRecord(format!( + "Cannot interpret size at byte {}: {e}", + bytes_already_read + i32::size() as u64 + )) + })?; + + // adding 8 bytes because code and size are part of the record. 
+ if size as u64 > cursor.get_ref().len() as u64 - cursor.position() + 2 * i32::size() as u64 + { + return Err(DmapError::InvalidRecord(format!( + "Record size {size} at byte {} bigger than remaining buffer {}", + cursor.position() - i32::size() as u64, + cursor.get_ref().len() as u64 - cursor.position() + 2 * i32::size() as u64 + ))); + } else if size <= 0 { + return Err(DmapError::InvalidRecord(format!("Record size {size} <= 0"))); + } + + let num_scalars = read_data::(cursor).map_err(|e| { + DmapError::InvalidRecord(format!( + "Cannot interpret number of scalars at byte {}: {e}", + cursor.position() - i32::size() as u64 + )) + })?; + let num_vectors = read_data::(cursor).map_err(|e| { + DmapError::InvalidRecord(format!( + "Cannot interpret number of vectors at byte {}: {e}", + cursor.position() - i32::size() as u64 + )) + })?; + if num_scalars <= 0 { + return Err(DmapError::InvalidRecord(format!( + "Number of scalars {num_scalars} at byte {} <= 0", + cursor.position() - 2 * i32::size() as u64 + ))); + } else if num_vectors <= 0 { + return Err(DmapError::InvalidRecord(format!( + "Number of vectors {num_vectors} at byte {} <= 0", + cursor.position() - i32::size() as u64 + ))); + } else if num_scalars + num_vectors > size { + return Err(DmapError::InvalidRecord(format!( + "Number of scalars {num_scalars} plus vectors {num_vectors} greater than size '{size}'"))); + } + + let mut fields: IndexMap = IndexMap::new(); + for _ in 0..num_scalars { + let (name, val) = parse_scalar(cursor)?; + fields.insert(name, val); + } + for _ in 0..num_vectors { + let here = cursor.position(); + let (name, dtype, _dims, num_elements) = parse_vector_header(cursor, size)?; + if Self::is_metadata_field(&name) { + cursor.set_position(here); + let (_, val) = parse_vector(cursor, size)?; + fields.insert(name.to_string(), val); + } else { + let vec_data_size = dtype.size() as u64 * num_elements as u64; + let here = cursor.position(); + cursor.set_position(here + vec_data_size); + } + } + + if 
cursor.position() - bytes_already_read != size as u64 { + return Err(DmapError::InvalidRecord(format!( + "Bytes read {} does not match the records size field {}", + cursor.position() - bytes_already_read, + size + ))); + } + + Ok(fields) + } + /// Reads a record from `cursor`. fn parse_record(cursor: &mut Cursor>) -> Result where @@ -820,6 +1003,9 @@ macro_rules! create_record_type { bytes.append(&mut data_bytes); // consumes data_bytes Ok(bytes) } + fn is_metadata_field(name: &str) -> bool { + !$fields.data_fields.iter().any(|e| e == &name) + } } impl TryFrom<&mut IndexMap> for [< $format:camel Record >] { diff --git a/src/types.rs b/src/types.rs index 62635be..7cebecb 100644 --- a/src/types.rs +++ b/src/types.rs @@ -29,6 +29,8 @@ pub struct Fields<'a> { pub vectors_optional: Vec<(&'a str, Type)>, /// Groups of vector fields which must have identical dimensions pub vector_dim_groups: Vec>, + /// The name of each field which is a data (as opposed to metadata) field + pub data_fields: Vec<&'a str>, } /// The possible data types that a scalar or vector field may have. @@ -102,7 +104,7 @@ impl Type { } } /// The size in bytes of the data for `Type` - fn size(&self) -> usize { + pub fn size(&self) -> usize { match self { Self::Char => 1, Self::Short => 2, @@ -981,7 +983,7 @@ pub(crate) fn parse_scalar(cursor: &mut Cursor>) -> Result<(String, Dmap /// Grabs the name and data type key from `cursor`. #[inline] -fn parse_header(cursor: &mut Cursor>) -> Result<(String, Type)> { +pub(crate) fn parse_header(cursor: &mut Cursor>) -> Result<(String, Type)> { let name = read_data::(cursor).map_err(|e| { DmapError::InvalidField(format!("Invalid name, byte {}: {e}", cursor.position())) })?; @@ -996,19 +998,17 @@ fn parse_header(cursor: &mut Cursor>) -> Result<(String, Type)> { Ok((name, data_type)) } -/// Parses a vector starting from the `cursor` position. +/// Parses a header for a vector starting from the `cursor` position. 
/// /// Interprets the bytes in `cursor` as follows: /// 1. `name`: a null-terminated string /// 2. `type`: a key indicating the data type ([`Type`]) /// 3. `num_dims`: the number of dimensions in the array, as an `i32`. /// 4. `dims`: the dimensions themselves, as a list of `num_dims` `i32`s, in column-major order. -/// 5. `data`: the data itself, of type `type` with shape `dims`, stored in column-major order. -pub(crate) fn parse_vector( +pub(crate) fn parse_vector_header( cursor: &mut Cursor>, record_size: i32, -) -> Result<(String, DmapField)> { - let start_position = cursor.position(); +) -> Result<(String, Type, Vec, i32)> { let (name, data_type) = parse_header(cursor)?; let vector_dimension = read_data::(cursor)?; @@ -1053,6 +1053,24 @@ pub(crate) fn parse_vector( ))); } + Ok((name, data_type, dimensions, total_elements)) +} + +/// Parses a vector starting from the `cursor` position. +/// +/// Interprets the bytes in `cursor` as follows: +/// 1. `name`: a null-terminated string +/// 2. `type`: a key indicating the data type ([`Type`]) +/// 3. `num_dims`: the number of dimensions in the array, as an `i32`. +/// 4. `dims`: the dimensions themselves, as a list of `num_dims` `i32`s, in column-major order. +/// 5. `data`: the data itself, of type `type` with shape `dims`, stored in column-major order. +pub(crate) fn parse_vector( + cursor: &mut Cursor>, + record_size: i32, +) -> Result<(String, DmapField)> { + let start_position = cursor.position(); + let (name, data_type, dimensions, total_elements) = parse_vector_header(cursor, record_size)?; + macro_rules! 
dmapvec_from_cursor { ($type:ty, $enum_var:path, $dims:ident, $cursor:ident, $num_elements:ident, $name:ident) => { $enum_var( diff --git a/tests/test_api.py b/tests/test_api.py index 411df63..d5cfc4c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,7 +13,82 @@ HERE = os.path.dirname(__file__) FORMATS = ("iqdat", "rawacf", "fitacf", "grid", "map", "snd") FILE_LENGTHS = (247688, 73528, 10780, 4612, 32668, 1659) - +DATA_FIELDS = ( + ("data",), + ("pwr0", "slist", "acfd", "xcfd"), + ( + "slist", + "nlag", + "qflg", + "gflg", + "p_l", + "p_l_e", + "p_s", + "p_s_e", + "v", + "v_e", + "w_l", + "w_l_e", + "w_s", + "w_s_e", + "sd_l", + "sd_s", + "sd_phi", + "x_qflg", + "x_gflg", + "x_p_l", + "x_p_l_e", + "x_p_s", + "x_p_s_e", + "x_v", + "x_v_e", + "x_w_l", + "x_w_l_e", + "x_w_s", + "x_w_s_e", + "phi0", + "phi0_e", + "elv", + "elv_fitted", + "elv_error", + "elv_low", + "elv_high", + "x_sd_l", + "x_sd_s", + "x_sd_phi" + ), + ( + "vector.mlat", + "vector.mlon", + "vector.kvect", + "vector.stid", + "vector.channel", + "vector.index", + "vector.vel.median", + "vector.vel.sd", + "vector.pwr.median", + "vector.pwr.sd", + "vector.wdt.median", + "vector.wdt.sd", + "vector.srng" + ), + ( + "vector.mlat", + "vector.mlon", + "vector.kvect", + "vector.stid", + "vector.channel", + "vector.index", + "vector.srng", + "vector.vel.median", + "vector.vel.sd", + "vector.pwr.median", + "vector.pwr.sd", + "vector.wdt.median", + "vector.wdt.sd" + ), + ("slist", "qflg", "gflg", "v", "v_e", "p_l", "w_l", "x_qflg", "phi0", "phi0_e") +) def compare_recs(data1, data2): """Compare two `list[dict]`s, checking they are identical.""" @@ -281,3 +356,12 @@ def test_key_wrong_type_read(fmt): with pytest.raises(ValueError): _ = getattr(dmap, f"read_{fmt}")(raw_bytes, mode="strict") + + +@pytest.mark.parametrize("fmt,data_fields", zip(FORMATS, DATA_FIELDS)) +def test_read_metadata(fmt, data_fields): + infile = f"{HERE}/test_files/test.{fmt}" + data = getattr(dmap, f"read_{fmt}")(infile, 
mode="metadata") + for rec in data: + assert not any([f in rec for f in data_fields]) + diff --git a/tests/tests.rs b/tests/tests.rs index ec3b857..0c9dad6 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -92,6 +92,17 @@ macro_rules! make_test { let all_recs = [< $record_type:camel Record >]::read_file(&filename).expect("Unable to read file"); assert_eq!(data, all_recs[0]) } + + #[test] + fn [< test_ $record_type _metadata >] () { + let filename: PathBuf = PathBuf::from(format!("tests/test_files/test.{}", stringify!($record_type))); + let data = [< $record_type:camel Record >]::read_file_metadata(&filename).expect("Unable to read file metadata"); + let all_recs = [< $record_type:camel Record >]::read_file(&filename).expect("Unable to read file"); + assert_eq!(data.len(), all_recs.len()); + for (mdata_rec, ref_rec) in izip!(data.iter(), all_recs.iter()) { + assert!(mdata_rec.keys().len() < ref_rec.keys().len()) + } + } } }; }