diff --git a/bao-http-tool/Cargo.toml b/bao-http-tool/Cargo.toml new file mode 100644 index 0000000..23e4a2a --- /dev/null +++ b/bao-http-tool/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "http-tool" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.82" +bao-tree = "0.13.0" +clap = { version = "4.5.4", features = ["derive"] } +iroh-io = { version = "0.6.0", features = ["x-http", "stats", "tokio-util"] } +range-collections = "0.4.5" +tokio = { version = "1.37.0", features = ["full"] } +tokio-util = "0.7.10" +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +url = "2.5.0" diff --git a/bao-http-tool/README.md b/bao-http-tool/README.md new file mode 100644 index 0000000..4e472a7 --- /dev/null +++ b/bao-http-tool/README.md @@ -0,0 +1,27 @@ +# Validate external data using external outboard + +## Usage + +Create an outboard for an asset that is available via http: + +``` +❯ bao-http-tool generate --data http://127.0.0.1:3003/370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.data --block-size-log 4 +Computing outboard for http://127.0.0.1:3003/370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.data of size 100000000 with block log 4, size 16384 +Computed hash: 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2 +Writing outboard to 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.obao4 +``` + +Validate a range of the file: + +``` +❯ bao-http-tool validate --hash 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2 --block-size-log 4 --data http://127.0.0.1:3003/370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.data --outboard 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.obao4 --range 0..10000000 +Size: 100000000 +Outboard: 390592 +Hash: 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2 +Block size: BlockSize(4) +Byte ranges: 
RangeSet{0..10000000} +Chunk ranges: RangeSet{0..9766} +confirmed that range RangeSet{0..10000000} of http://127.0.0.1:3003/370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2.data matches 370e2b3002be3b38b120f7b3be53da4cf646810e26f8f4a5018247c8188af5b2 +``` + +Note that for validation both data **and** outboard can be available via http. \ No newline at end of file diff --git a/bao-http-tool/src/args.rs b/bao-http-tool/src/args.rs new file mode 100644 index 0000000..94bd9e0 --- /dev/null +++ b/bao-http-tool/src/args.rs @@ -0,0 +1,74 @@ +use std::{fmt::Display, path::PathBuf, str::FromStr}; + +use bao_tree::blake3::Hash; +use clap::Parser; +use url::Url; + +#[derive(Debug, Parser)] +pub struct Args { + #[clap(subcommand)] + pub subcommand: SubCommand, +} + +#[derive(Debug, Parser)] +pub enum SubCommand { + Validate(ValidateArgs), + Generate(GenerateArgs), +} + +#[derive(Debug, Parser)] +pub struct ValidateArgs { + #[clap(long, help = "Hash of the data")] + pub hash: Hash, + + #[clap(long, help = "URL or local path to the data")] + pub data: PathOrUrl, + + #[clap(long, help = "URL or local path to the outboard")] + pub outboard: PathOrUrl, + + #[clap(long, default_value_t = 0, help = "Block size in log2(bytes)")] + pub block_size_log: u8, + + #[clap(long, help = "Range of the data to read")] + pub range: Option, +} + +#[derive(Debug, Parser)] +pub struct GenerateArgs { + #[clap(long, help = "URL to the data")] + pub data: Url, + + #[clap(long, help = "path where to create the outboard")] + pub target: Option, + + #[clap(long, default_value_t = 0, help = "Block size in log2(bytes)")] + pub block_size_log: u8, +} + +#[derive(Debug, Clone)] +pub enum PathOrUrl { + Path(std::path::PathBuf), + Url(Url), +} + +impl Display for PathOrUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PathOrUrl::Path(path) => write!(f, "{}", path.display()), + PathOrUrl::Url(url) => write!(f, "{}", url), + } + } +} + +impl FromStr for 
PathOrUrl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + if let Ok(url) = Url::from_str(s) { + Ok(PathOrUrl::Url(url)) + } else { + Ok(PathOrUrl::Path(std::path::PathBuf::from(s))) + } + } +} diff --git a/bao-http-tool/src/main.rs b/bao-http-tool/src/main.rs new file mode 100644 index 0000000..ded4bb8 --- /dev/null +++ b/bao-http-tool/src/main.rs @@ -0,0 +1,116 @@ +use std::io::Cursor; + +use bao_tree::{ + io::{ + fsm::{encode_ranges_validated, outboard_post_order}, + outboard::PreOrderOutboard, + round_up_to_chunks, + }, + BaoTree, BlockSize, ByteRanges, +}; +use clap::Parser; +use iroh_io::{AsyncSliceReader, HttpAdapter}; +use tokio_util::either::Either; +mod args; +mod outboard; +use args::{GenerateArgs, PathOrUrl, SubCommand, ValidateArgs}; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; + +use crate::args::Args; + +type Reader = Either; + +async fn open(path: PathOrUrl) -> anyhow::Result { + Ok(match path { + PathOrUrl::Url(url) => Either::Left(iroh_io::HttpAdapter::new(url)), + PathOrUrl::Path(path) => Either::Right(iroh_io::File::open(path).await?), + }) +} + +async fn generate(args: GenerateArgs) -> anyhow::Result<()> { + let mut outboard = Vec::new(); + let mut data = HttpAdapter::new(args.data.clone()); + let block_size = BlockSize::from_chunk_log(args.block_size_log); + let size = data.size().await?; + let tree = BaoTree::new(size, block_size); + println!( + "Computing outboard for {} of size {} with block log {}, size {}", + args.data, + size, + block_size.chunk_log(), + block_size.bytes() + ); + let hash = outboard_post_order(Cursor::new(data), tree, &mut outboard).await?; + println!("Computed hash: {}", hash.to_hex()); + let filename = args + .target + .unwrap_or_else(|| format!("{}.obao{}", hash.to_hex(), block_size.chunk_log()).into()); + println!("Writing outboard to {}", filename.display()); + tokio::fs::write(filename, outboard).await?; + Ok(()) +} + +async fn validate(args: ValidateArgs) 
-> anyhow::Result<()> { + let mut data = open(args.data.clone()).await?; + let mut outboard = open(args.outboard.clone()).await?; + let size = data.size().await?; + let outboard_size = outboard.size().await?; + let byte_ranges = if let Some(range) = args + .range + .as_ref() + .map(|x| x.split("..").collect::>()) + { + if range.len() != 2 { + anyhow::bail!("Invalid range"); + } + let start: u64 = range[0].parse()?; + let end: u64 = range[1].parse()?; + ByteRanges::from(start..end) + } else { + ByteRanges::all() + }; + let chunk_ranges = round_up_to_chunks(&byte_ranges); + // let size = outboard.read_at(0, 8).await?; + // let size = u64::from_le_bytes(size.as_ref().try_into()?); + let block_size = BlockSize::from_chunk_log(args.block_size_log); + println!("Size: {}", size); + println!("Outboard: {}", outboard_size); + println!("Hash: {}", args.hash.to_hex()); + println!("Block size: {}", block_size); + println!("Byte ranges: {:?}", byte_ranges); + println!("Chunk ranges: {:?}", chunk_ranges); + let tree = BaoTree::new(size, block_size); + let outboard = PreOrderOutboard { + root: args.hash, + data: outboard, + tree, + }; + let encoded = Vec::new(); + encode_ranges_validated(data, outboard, &chunk_ranges, encoded).await?; + println!( + "confirmed that range {:?} of {} matches {}", + byte_ranges, + args.data, + args.hash.to_hex() + ); + Ok(()) +} + +fn setup_logging() { + tracing_subscriber::registry() + .with(tracing_subscriber::fmt::layer().with_writer(std::io::stderr)) + .with(EnvFilter::from_default_env()) + .try_init() + .ok(); +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + setup_logging(); + let args = Args::parse(); + match args.subcommand { + SubCommand::Validate(args) => validate(args).await?, + SubCommand::Generate(args) => generate(args).await?, + } + Ok(()) +} diff --git a/bao-http-tool/src/outboard.rs b/bao-http-tool/src/outboard.rs new file mode 100644 index 0000000..ebcc675 --- /dev/null +++ b/bao-http-tool/src/outboard.rs @@ -0,0 +1,93 
@@ +use std::{ + collections::BTreeMap, + io, + sync::{Arc, RwLock}, +}; + +use bao_tree::{blake3::Hash, iter::BaoChunk, BaoTree, ByteRanges, ChunkRangesRef, TreeNode}; +use range_collections::{range_set::RangeSetRange, RangeSetRef}; + +struct SparseOutboardCache { + cache: Arc>>, + inner: T, +} + +impl bao_tree::io::fsm::Outboard for SparseOutboardCache { + fn root(&self) -> bao_tree::blake3::Hash { + self.inner.root() + } + + fn tree(&self) -> bao_tree::BaoTree { + self.inner.tree() + } + + async fn load( + &mut self, + node: bao_tree::TreeNode, + ) -> io::Result> { + if let Some(entry) = self.cache.read().unwrap().get(&node) { + return Ok(Some(entry.clone())); + } + let entry = self.inner.load(node).await?; + if let Some(entry) = entry { + self.cache.write().unwrap().insert(node, entry.clone()); + } + Ok(entry) + } +} + +/// Get the set of node IDs needed to encode the given chunk ranges. +pub async fn node_ids( + tree: BaoTree, + ranges: &ChunkRangesRef, +) -> impl Iterator + '_ { + // todo! + // let ranges = bao_tree::rec::truncate_ranges(ranges, outboard.tree().size()) + tree.ranges_pre_order_chunks_iter_ref(ranges, tree.block_size().chunk_log()) + .filter_map(|chunk| match chunk { + BaoChunk::Parent { node, .. } => Some(node), + _ => None, + }) +} + +/// Compute the byte ranges of a pre order offset without size prefix to encode the given nodes. +pub fn pre_order_ranges(tree: BaoTree, nodes: impl IntoIterator) -> ByteRanges { + let mut res = ByteRanges::empty(); + for node in nodes { + if let Some(offset) = tree.post_order_offset(node) { + let offset = offset.value() * 64; + res |= ByteRanges::from(offset..offset + 64); + } + } + res +} + +/// Rounds up ranges to the nearest chunk boundaries, where a chunk is 2^block_shift bytes. 
+pub fn simplify(ranges: &RangeSetRef, block_shift: u8) -> ByteRanges { + let mut res = ByteRanges::empty(); + for item in ranges.iter() { + match item { + RangeSetRange::Range(range) => { + let start = (range.start >> block_shift) << block_shift; + let end = ((range.end + ((1 << block_shift) - 1)) >> block_shift) << block_shift; + res |= ByteRanges::from(start..end); + } + RangeSetRange::RangeFrom(range) => { + let start = (range.start >> block_shift) << block_shift; + res |= ByteRanges::from(start..); + } + } + } + res +} + +/// Prefetch all nodes in the tree that are needed to encode the given chunk ranges. +pub async fn prefetch( + mut outboard: impl bao_tree::io::fsm::Outboard, + ranges: &ChunkRangesRef, +) -> io::Result<()> { + for node in node_ids(outboard.tree(), ranges).await { + outboard.load(node).await?; + } + Ok(()) +} diff --git a/validate-http/Cargo.toml b/validate-http/Cargo.toml new file mode 100644 index 0000000..f27c783 --- /dev/null +++ b/validate-http/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "bao-http-tool" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.82" +bao-tree = "0.13.0" +clap = { version = "4.5.4", features = ["derive"] } +iroh-io = { version = "0.6.0", features = ["x-http", "stats", "tokio-util"] } +tokio = { version = "1.37.0", features = ["full"] } +tokio-util = "0.7.10" +url = "2.5.0"