diff --git a/Cargo.lock b/Cargo.lock index b7ccff2fa..7c120a3ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2039,6 +2039,26 @@ version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e60eed09d8c01d3cee5b7d30acb059b76614c918fa0f992e0dd6eeb10daad6f" +[[package]] +name = "deepsize" +version = "0.2.0" +source = "git+https://github.com/chirino/deepsize?branch=main#c6a982656a57cf32efa345372c2955f1b8f68f92" +dependencies = [ + "cpe", + "deepsize_derive", + "petgraph 0.7.1", +] + +[[package]] +name = "deepsize_derive" +version = "0.1.2" +source = "git+https://github.com/chirino/deepsize?branch=main#c6a982656a57cf32efa345372c2955f1b8f68f92" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "deflate64" version = "0.1.9" @@ -8378,6 +8398,7 @@ dependencies = [ "chrono", "clap", "cpe", + "deepsize", "hex", "human-date-parser", "itertools 0.13.0", @@ -8433,6 +8454,7 @@ dependencies = [ "anyhow", "async-graphql", "cpe", + "deepsize", "log", "rstest", "schemars", @@ -8519,6 +8541,7 @@ dependencies = [ "cpe", "criterion", "csaf", + "deepsize", "hex", "humantime", "itertools 0.13.0", diff --git a/Cargo.toml b/Cargo.toml index 9d6982167..7645ddb7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,7 @@ criterion = "0.5.1" csaf = { version = "0.5.0", default-features = false } csaf-walker = { version = "0.10.0", default-features = false } cve = "0.3.1" +deepsize = "0.2.0" env_logger = "0.11.0" futures = "0.3.30" futures-util = "0.3" @@ -204,3 +205,6 @@ osv = { git = "https://github.com/ctron/osv", rev = "b53f1590bbbdc663e3efe405f1f # to pickup fix: https://github.com/Abraxas-365/langchain-rust/pull/246 # and fix: https://github.com/Abraxas-365/langchain-rust/pull/250 langchain-rust = { git = "https://github.com/chirino/langchain-rust", branch = "main" } + +# to pickup feat: https://github.com/Aeledfyr/deepsize/pull/41 +deepsize = { git = "https://github.com/chirino/deepsize", branch = "main" } diff --git 
a/common/Cargo.toml b/common/Cargo.toml index ce818b675..3210e5f07 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -15,6 +15,7 @@ bytesize = { workspace = true, features = ["serde"] } chrono = { workspace = true } clap = { workspace = true, features = ["derive", "env"] } cpe = { workspace = true } +deepsize = { workspace = true } hex = { workspace = true } human-date-parser = { workspace = true } itertools = { workspace = true } diff --git a/common/src/cpe.rs b/common/src/cpe.rs index 73259e205..444b87c25 100644 --- a/common/src/cpe.rs +++ b/common/src/cpe.rs @@ -2,6 +2,7 @@ use cpe::{ cpe::Cpe as _, uri::{OwnedUri, Uri}, }; +use deepsize::DeepSizeOf; use serde::{ de::{Error, Visitor}, Deserialize, Deserializer, Serialize, Serializer, @@ -17,7 +18,7 @@ use utoipa::{ }; use uuid::Uuid; -#[derive(Clone, Hash, Eq, PartialEq)] +#[derive(Clone, Hash, Eq, PartialEq, DeepSizeOf)] pub struct Cpe { uri: OwnedUri, } diff --git a/common/src/purl.rs b/common/src/purl.rs index d5ca52e08..2017739ff 100644 --- a/common/src/purl.rs +++ b/common/src/purl.rs @@ -1,3 +1,4 @@ +use deepsize::DeepSizeOf; use packageurl::PackageUrl; use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS}; use serde::{ @@ -25,7 +26,7 @@ pub enum PurlErr { Package(#[from] packageurl::Error), } -#[derive(Clone, PartialEq, Eq, Hash)] +#[derive(Clone, PartialEq, Eq, Hash, DeepSizeOf)] pub struct Purl { pub ty: String, pub namespace: Option, diff --git a/entity/Cargo.toml b/entity/Cargo.toml index 79f81d8ab..7b0d825ce 100644 --- a/entity/Cargo.toml +++ b/entity/Cargo.toml @@ -11,6 +11,7 @@ trustify-cvss = { workspace = true } async-graphql = { workspace = true, features = ["uuid", "time"] } cpe = { workspace = true } +deepsize = { workspace = true } schemars = { workspace = true } sea-orm = { workspace = true, features = [ "sqlx-postgres", diff --git a/entity/src/relationship.rs b/entity/src/relationship.rs index 669221cb7..e3334b02d 100644 --- a/entity/src/relationship.rs +++ 
b/entity/src/relationship.rs @@ -1,3 +1,4 @@ +use deepsize::DeepSizeOf; use sea_orm::{DeriveActiveEnum, EnumIter}; use std::fmt; @@ -17,6 +18,7 @@ use std::fmt; )] #[sea_orm(rs_type = "i32", db_type = "Integer")] #[serde(rename_all = "snake_case")] +#[derive(DeepSizeOf)] // When adding a new variant, also add this to the "relationship" table. pub enum Relationship { #[sea_orm(num_value = 0)] diff --git a/modules/analysis/Cargo.toml b/modules/analysis/Cargo.toml index aaee79e57..7878c4bf7 100644 --- a/modules/analysis/Cargo.toml +++ b/modules/analysis/Cargo.toml @@ -14,6 +14,7 @@ actix-http = { workspace = true } actix-web = { workspace = true } anyhow = { workspace = true } cpe = { workspace = true } +deepsize = { workspace = true, features = ["cpe", "petgraph"] } log = { workspace = true } moka = { workspace = true, features = ["sync"] } parking_lot = { workspace = true } diff --git a/modules/analysis/src/model.rs b/modules/analysis/src/model.rs index 23a1fe8d3..81f91f1e5 100644 --- a/modules/analysis/src/model.rs +++ b/modules/analysis/src/model.rs @@ -5,6 +5,7 @@ use std::{ ops::{Deref, DerefMut}, }; +use deepsize::DeepSizeOf; use moka::sync::Cache; use std::sync::Arc; use trustify_common::{cpe::Cpe, purl::Purl}; @@ -25,7 +26,7 @@ impl fmt::Display for AnalysisStatus { } } -#[derive(Debug, Clone, PartialEq, Eq, ToSchema, serde::Serialize)] +#[derive(Debug, Clone, PartialEq, Eq, ToSchema, serde::Serialize, DeepSizeOf)] pub struct PackageNode { pub sbom_id: String, pub node_id: String, @@ -37,33 +38,6 @@ pub struct PackageNode { pub document_id: String, pub product_name: String, pub product_version: String, - pub approximate_memory_size: u32, -} - -impl PackageNode { - pub(crate) fn set_approximate_memory_size(&self) -> PackageNode { - // Is there a better way to do this? 
- let size = size_of::() - + self.sbom_id.len() - + self.node_id.len() - + self.purl.iter().fold(0, |acc, purl| - // use the json string length as an approximation of the memory size - acc + serde_json::to_string(purl).unwrap_or_else(|_| "".to_string()).len()) - + self.cpe.iter().fold(0, |acc, cpe| - // use the json string length as an approximation of the memory size - acc + serde_json::to_string(cpe).unwrap_or_else(|_| "".to_string()).len()) - + self.name.len() - + self.version.len() - + self.published.len() - + self.document_id.len() - + self.product_name.len() - + self.product_version.len(); - - PackageNode { - approximate_memory_size: size.try_into().unwrap_or(u32::MAX), - ..self.clone() - } - } } impl fmt::Display for PackageNode { @@ -188,20 +162,26 @@ pub struct GraphMap { } #[allow(clippy::ptr_arg)] // &String is required by Cache::builder().weigher() method -fn weigher(key: &String, value: &Arc) -> u32 { - let mut result = key.len(); - for n in value.raw_nodes() { - result += n.weight.approximate_memory_size as usize; - } - result += size_of_val(value.raw_edges()); - result.try_into().unwrap_or(u32::MAX) +fn size_of_graph_entry(key: &String, value: &Arc) -> u32 { + ( + key.deep_size_of() + + value.as_ref().deep_size_of() + // Also add in some entry overhead of the cache entry + + 20 + // todo: find a better estimate for the moka ValueEntry + ) + .try_into() + .unwrap_or(u32::MAX) } impl GraphMap { // Create a new instance of GraphMap pub fn new(cap: u64) -> Self { GraphMap { - map: Cache::builder().weigher(weigher).max_capacity(cap).build(), + map: Cache::builder() + .weigher(size_of_graph_entry) + .max_capacity(cap) + .build(), } } diff --git a/modules/analysis/src/service/load.rs b/modules/analysis/src/service/load.rs index 8433364da..4a4b3edde 100644 --- a/modules/analysis/src/service/load.rs +++ b/modules/analysis/src/service/load.rs @@ -254,22 +254,18 @@ impl AnalysisService { match nodes.entry(package.node_id.clone()) { Entry::Vacant(entry) => { 
- let index = g.add_node( - PackageNode { - sbom_id: distinct_sbom_id.to_string(), - node_id: package.node_id, - purl: to_purls(package.purls), - cpe: to_cpes(package.cpes), - name: package.node_name, - version: package.node_version.clone().unwrap_or_default(), - published: package.published.clone(), - document_id: package.document_id.clone().unwrap_or_default(), - product_name: package.product_name.clone().unwrap_or_default(), - product_version: package.product_version.clone().unwrap_or_default(), - approximate_memory_size: 0, - } - .set_approximate_memory_size(), - ); + let index = g.add_node(PackageNode { + sbom_id: distinct_sbom_id.to_string(), + node_id: package.node_id, + purl: to_purls(package.purls), + cpe: to_cpes(package.cpes), + name: package.node_name, + version: package.node_version.clone().unwrap_or_default(), + published: package.published.clone(), + document_id: package.document_id.clone().unwrap_or_default(), + product_name: package.product_name.clone().unwrap_or_default(), + product_version: package.product_version.clone().unwrap_or_default(), + }); log::debug!("Inserting - id: {}, index: {index:?}", entry.key()); diff --git a/modules/analysis/src/service/test.rs b/modules/analysis/src/service/test.rs index b96fc6151..7af6fabf9 100644 --- a/modules/analysis/src/service/test.rs +++ b/modules/analysis/src/service/test.rs @@ -261,18 +261,19 @@ async fn test_cache_size_used(ctx: &TrustifyContext) -> Result<(), anyhow::Error let all_graphs = service.load_all_graphs(&ctx.db).await?; assert_eq!(all_graphs.len(), 1); - // Does 3.4 KB sound right? + let kb = 1024; let small_sbom_size = service.cache_size_used(); - assert_eq!(small_sbom_size, 3505u64); + assert!(small_sbom_size > 6 * kb); + assert!(small_sbom_size < 7 * kb); ctx.ingest_documents(["spdx/quarkus-bom-3.2.11.Final-redhat-00001.json"]) .await?; let all_graphs = service.load_all_graphs(&ctx.db).await?; assert_eq!(all_graphs.len(), 2); - // Does 676.7 KB sound right? 
let big_sbom_size = service.cache_size_used() - small_sbom_size; - assert_eq!(big_sbom_size, 693006u64); + assert!(big_sbom_size > 950 * kb); + assert!(big_sbom_size < 960 * kb); // Now lets try it with small cache that can at least fit the small bom let service = AnalysisService::new_sized(small_sbom_size * 2);