diff --git a/Cargo.lock b/Cargo.lock index a689c90..f499f18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -389,6 +389,18 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "771fe0050b883fcc3ea2359b1a96bcfbc090b7116eae7c3c512c7a083fdf23d3" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.8.0" @@ -419,6 +431,15 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "631ae5198c9be5e753e5cc215e1bd73c2b466a3565173db433f52bb9d3e66dba" +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version 0.4.0", +] + [[package]] name = "cc" version = "1.0.71" @@ -505,6 +526,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + [[package]] name = "colored" version = "1.9.3" @@ -571,6 +603,76 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.5" @@ -624,6 +726,28 @@ dependencies = [ "syn", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "ctor" version = "0.1.21" @@ -712,7 +836,7 @@ dependencies = [ "convert_case", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.3.3", "syn", ] @@ -1106,6 +1230,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + [[package]] name = "hashbrown" version = "0.11.2" @@ -1347,6 +1477,7 @@ version = "0.1.0" dependencies = [ "chashmap", "cid", + "criterion", "ctrlc", "futures", "libp2p", @@ -2233,6 +2364,12 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -2483,6 +2620,34 @@ version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12295df4f294471248581bc09bef3c38a5e46f1e36d6a37353621a0c6c357e1f" +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + [[package]] name = "polling" version = "2.1.0" @@ -2782,6 +2947,31 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -2811,6 +3001,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + [[package]] name = "regex-syntax" version = "0.6.25" @@ -2894,7 +3090,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" dependencies = [ - "semver", + "semver 0.11.0", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver 1.0.4", ] [[package]] @@ -2936,6 +3141,15 @@ dependencies = [ "cipher", ] +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.19" @@ -3030,6 +3244,12 @@ dependencies = [ "semver-parser", ] +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + [[package]] name = "semver-parser" version = "0.10.2" @@ -3048,6 +3268,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + [[package]] name = "serde_derive" version = "1.0.130" @@ -3188,7 +3418,7 @@ dependencies = [ "rand 0.8.4", "rand_core 0.6.3", "ring", - "rustc_version", + "rustc_version 0.3.3", "sha2", "subtle", "x25519-dalek", @@ -3329,6 +3559,15 @@ dependencies = [ "utf-8", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "thin-slice" version = "0.1.1" @@ -3365,6 +3604,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.5.0" @@ -3641,6 +3890,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + [[package]] name = "want" version = "0.3.0" @@ -3815,6 +4075,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 760b100..6d8022b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,4 +14,11 @@ log = "0.4.14" simple_logger = "1.13.0" reqwest = { version = "0.11.6", features = ["gzip", "blocking"] } scraper = "0.12.0" -ctrlc = "3.2.1" \ No newline at end of file +ctrlc = "3.2.1" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "test_benchmark" +harness = false diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs new file mode 100644 index 0000000..4aa1e5f --- /dev/null +++ b/benches/test_benchmark.rs @@ -0,0 +1,17 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +fn fibonacci(n: u64) -> u64 { + match n { + 0 => 1, + 1 => 1, + n => fibonacci(n - 1) + fibonacci(n - 2), + } +} + +fn criterion_benchmark(c: &mut Criterion) { + // TODO: replace with real benchmarks + c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/src/index_result.rs b/src/index_result.rs new file mode 100644 index 0000000..c151e2a --- /dev/null +++ b/src/index_result.rs @@ -0,0 +1,101 @@ +use std::{collections::HashMap, fmt}; + +pub struct IndexResult { + pub cid: String, + pub title: String, + pub excerpt: String, + pub keywords: HashMap, +} + +impl IndexResult { + pub fn new( + cid: String, + title: String, + excerpt: String, + keywords: HashMap, + ) -> IndexResult { + IndexResult { + cid: cid, + title: title, + excerpt: excerpt, + keywords: keywords, + } + } + + /** + * Returns the top n keywords. Todo: use a tree structure to store the rankings of the keywords + * so that this is faster + */ + pub fn top_n_keywords(&self, n: u32) -> Vec<(&String, &u32)> { + let mut hash_vec: Vec<(&String, &u32)> = self.keywords.iter().collect(); + hash_vec.sort_by(|a, b| b.1.cmp(a.1)); + hash_vec.iter().take(n as usize).cloned().collect() + } +} + +impl fmt::Display for IndexResult { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "CID: {} \nTitle: {}\n{}\nKeywords: {:?}", + self.cid, + self.title, + self.excerpt, + self.top_n_keywords(10) + ) + } +} + +#[cfg(test)] +mod tests { + use std::array::IntoIter; + use std::{collections::HashMap, iter::FromIterator}; + + use crate::index_result::IndexResult; + + #[test] + fn single_keyword() { + let keywords = HashMap::<_, _>::from_iter(IntoIter::new([("key1".to_string(), 1)])); + + let result = IndexResult::new( + "1".to_string(), + "title".to_string(), + "excerpt".to_string(), + keywords, + ); + assert_eq!(result.top_n_keywords(10).len(), 1); + } + #[test] + + fn all_keywords() { + let keywords = HashMap::<_, _>::from_iter(IntoIter::new([ + ("key1".to_string(), 1), + ("key2".to_string(), 2), + ])); + + let result = IndexResult::new( + "1".to_string(), + "title".to_string(), + "excerpt".to_string(), + keywords, + ); + assert_eq!(result.top_n_keywords(2).len(), 2); + } + + #[test] + fn subset_of_keywords() { + let keywords = HashMap::<_, _>::from_iter(IntoIter::new([ + ("key1".to_string(), 1), + ("key2".to_string(), 2), + ("key2".to_string(), 3), + ])); + + let result = IndexResult::new( + "1".to_string(), + "title".to_string(), + "excerpt".to_string(), + keywords, + ); + assert_eq!(result.top_n_keywords(2).len(), 2); + } +} diff --git a/src/indexer.rs b/src/indexer.rs index c27d779..5706ff2 100644 --- a/src/indexer.rs +++ b/src/indexer.rs @@ -1,62 +1,16 @@ +use super::index_result::IndexResult; use chashmap::CHashMap; use cid::multihash::{Code, MultihashDigest}; use cid::Cid; use log::{info, trace, warn}; use scraper::{Html, Selector}; use std::collections::HashMap; -use std::fmt; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::{sync, thread, time}; pub const RAW: u64 = 0x55; -struct IndexResult { - pub cid: String, - pub title: String, - pub excerpt: String, - pub keywords: HashMap, -} - -impl IndexResult { - pub fn new( - cid: String, - title: String, - excerpt: String, - keywords: HashMap, - ) -> IndexResult { - IndexResult { - cid: cid, - title: title, - excerpt: excerpt, - keywords: keywords, - } - } - - /** - * Returns the top n keywords. Todo: use a tree structure to store the rankings of the keywords - * so that this is faster - */ - pub fn top_n_keywords(&self, n: u32) -> Vec<(&String, &u32)> { - let mut hash_vec: Vec<(&String, &u32)> = self.keywords.iter().collect(); - hash_vec.sort_by(|a, b| b.1.cmp(a.1)); - hash_vec.iter().take(n as usize).cloned().collect() - } -} - -impl fmt::Display for IndexResult { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "CID: {} \nTitle: {}\n{}\nKeywords: {:?}", - self.cid, - self.title, - self.excerpt, - self.top_n_keywords(10) - ) - } -} - pub struct Indexer { // this map is for keeping track of which entries have been indexed map: sync::Arc>, diff --git a/src/main.rs b/src/main.rs index 7e30f79..bcde204 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,7 +9,9 @@ use std::error::Error; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +mod index_result; mod indexer; + use indexer::Indexer; fn main() -> Result<(), Box> {