From 06d5566bfc75ca3ab0a14d4d48c7d12bd001c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= <91641923+quangdang46@users.noreply.github.com> Date: Sat, 9 May 2026 22:33:40 +0700 Subject: [PATCH 1/2] fix: port three upstream hardening follow-ups to Rust core Mirrors three small, isolated upstream Python commits to the Rust port. Each fix is gated by regression tests that fail on the pre-fix code. 1. miner Windows hardening (upstream 5488e7b) - SKIP_FILES gains pnpm-lock.yaml + yarn.lock so JS/TS lockfiles don't get embedded. - mine_file() refuses to embed any file that produces more than MAX_CHUNKS_PER_FILE chunks (500) and prints a concrete skip message naming the file. Catches the broader class of generated artifacts (CSV/JSON dumps, build outputs, lockfiles not yet in SKIP_FILES) that the named-file list cannot fully cover. Upstream cited Windows ONNX 'bad_alloc' as the symptom this prevents. 2. diary state backfill (upstream 2ff6283, follow-up to PR #2 fix #3) - When a pre-Fix-#3 state file has size set but no content_hash, the size-skip path correctly avoids re-ingesting the diary, but also writes the missing hash back so subsequent runs use the strict hash check. Without the backfill, post-upgrade same-size edits would still slip through the legacy code path forever. 3. exporter per-file symlink check (upstream 7545238, follow-up to PR #2 fix #2) - safe_open_for_write() opens room files and index.md with O_NOFOLLOW on POSIX so the open itself fails ELOOP if a symlink was swapped in between create-dir and file-open. Closes the TOCTOU window the directory-level reject_symlink leaves open. Windows path falls back to a symlink_metadata pre-check. Tests: 386/386 passing locally (+6 new regression tests). Out of scope (analyzed, do not apply to Rust): - 5134a63 SQLite preflight: Rust palace uses HNSW, not chromadb. - d5ce97c palace lock byte-0 sentinel: Rust uses file-existence locks. - ef8d83c lock holder + non-zero exit: already implemented. - 71804c0 / 3a76360 hooks Popen detach + PID guard: Rust hooks emit JSON only, do not spawn background processes. --- crates/core/src/diary_ingest.rs | 115 +++++++++++++++++++++++++++--- crates/core/src/exporter.rs | 119 ++++++++++++++++++++++++++++++-- crates/core/src/miner.rs | 91 ++++++++++++++++++++++++ 3 files changed, 311 insertions(+), 14 deletions(-) diff --git a/crates/core/src/diary_ingest.rs b/crates/core/src/diary_ingest.rs index a168b80..46c4bb6 100644 --- a/crates/core/src/diary_ingest.rs +++ b/crates/core/src/diary_ingest.rs @@ -45,6 +45,11 @@ pub fn ingest_diaries( let mut palace_db = PalaceDb::open(palace_path)?; let mut days_updated = 0usize; let mut closets_created = 0usize; + // Track legacy-state backfills (content_hash filled in for entries that + // previously had only the size). These don't update `days_updated` + // because no drawer is rewritten — but the state file still needs to + // be persisted so the strict hash check sticks across runs. + let mut state_dirty = false; // Find all .md files let diary_files: Vec = WalkDir::new(&diary_dir) @@ -84,14 +89,25 @@ pub fn ingest_diaries( let curr_size = text.len(); let curr_hash = format!("{:x}", Sha256::digest(text.as_bytes())); if !force { - if let Some(prev) = state.get(&state_key) { + if let Some(prev) = state.get(&state_key).cloned() { if let Some(prev_hash) = prev.content_hash.as_ref() { if prev_hash == &curr_hash { continue; } } else if prev.size > 0 && prev.size == curr_size { - // Legacy state without content_hash: keep size-based skip so a - // post-upgrade run doesn't re-ingest every untouched diary. + // Legacy state without content_hash: keep size-based skip but + // backfill the hash so future runs use the strict check + // (mirrors upstream mempalace `2ff6283`). Without this, a + // pre-Fix-#3 state file would stay on the size-only path + // forever and re-introduce the same-size-edit blind spot. + state.insert( + state_key.clone(), + StateEntry { + content_hash: Some(curr_hash.clone()), + ..prev + }, + ); + state_dirty = true; continue; } } @@ -143,13 +159,18 @@ pub fn ingest_diaries( palace_db.flush()?; - // Save state - if days_updated > 0 { + // Save state when any drawer was rewritten OR when legacy entries + // were backfilled with their content_hash. The latter is silent (no + // banner) because no drawer changed — but persisting it is what + // makes the strict hash check stick on the next run. + if days_updated > 0 || state_dirty { fs::write(&state_file, serde_json::to_string_pretty(&state)?)?; - println!( - "Diary: {} days updated, {} new closets", - days_updated, closets_created - ); + if days_updated > 0 { + println!( + "Diary: {} days updated, {} new closets", + days_updated, closets_created + ); + } } Ok(DiaryIngestStats { @@ -394,4 +415,80 @@ mod tests { None => std::env::remove_var("HOME"), } } + + #[test] + fn test_ingest_diaries_backfills_content_hash_on_legacy_state() { + // Regression for upstream mempalace 2ff6283 (follow-up to #925). + // + // A pre-Fix-#3 state file has `size` set but no `content_hash`. The + // size-skip path correctly avoids re-ingesting an untouched diary, + // but it must also write the missing hash back so subsequent runs + // use the strict hash check — otherwise a same-size edit done after + // the upgrade would still slip through the legacy code path + // forever. + let _guard = crate::test_env_lock() + .lock() + .expect("test env lock should be available"); + let temp = tempfile::TempDir::new().unwrap(); + let prev_home = std::env::var_os("HOME"); + std::env::set_var("HOME", temp.path()); + + let palace_path = temp.path().join("palace"); + let diary_dir = temp.path().join("diaries"); + std::fs::create_dir_all(&diary_dir).unwrap(); + + let diary_file = diary_dir.join("2024-01-15.md"); + let original = "## Notes\n\nteh quick brown fox jumps over the lazy dog and again here.\n"; + std::fs::write(&diary_file, original).unwrap(); + + // First ingest establishes baseline state (with content_hash). + let stats = ingest_diaries(&diary_dir, Some(&palace_path), "diary", false).unwrap(); + assert_eq!(stats.days_updated, 1); + + // Simulate a legacy state file: load, strip content_hash, rewrite. + let state_path = state_file_for(&palace_path, &diary_dir).unwrap(); + let mut raw_state: HashMap = + serde_json::from_str(&std::fs::read_to_string(&state_path).unwrap()).unwrap(); + for entry in raw_state.values_mut() { + if let Some(obj) = entry.as_object_mut() { + obj.remove("content_hash"); + } + } + std::fs::write( + &state_path, + serde_json::to_string_pretty(&raw_state).unwrap(), + ) + .unwrap(); + + // Re-ingest with unchanged content: legacy size-skip path triggers, + // but the backfill must persist content_hash to disk. + let stats = ingest_diaries(&diary_dir, Some(&palace_path), "diary", false).unwrap(); + assert_eq!( + stats.days_updated, 0, + "legacy size-skip path should still skip unchanged content" + ); + + let after: HashMap = + serde_json::from_str(&std::fs::read_to_string(&state_path).unwrap()).unwrap(); + let entry = after.values().next().expect("state should have one entry"); + assert!( + entry.content_hash.is_some(), + "backfill must populate content_hash so future runs use the strict check" + ); + + // Now a same-size edit must be detected via the freshly-backfilled hash. + let edited = original.replace("teh", "the"); + assert_eq!(edited.len(), original.len()); + std::fs::write(&diary_file, &edited).unwrap(); + let stats = ingest_diaries(&diary_dir, Some(&palace_path), "diary", false).unwrap(); + assert_eq!( + stats.days_updated, 1, + "post-backfill: same-size edit must be detected via content hash" + ); + + match prev_home { + Some(h) => std::env::set_var("HOME", h), + None => std::env::remove_var("HOME"), + } + } } diff --git a/crates/core/src/exporter.rs b/crates/core/src/exporter.rs index 57b777b..f8c3900 100644 --- a/crates/core/src/exporter.rs +++ b/crates/core/src/exporter.rs @@ -38,6 +38,58 @@ fn reject_symlink(path: &Path, label: &str) -> anyhow::Result<()> { Ok(()) } +/// Open a file for writing, refusing to follow a symlink at the target +/// path. +/// +/// Mirrors upstream mempalace `7545238` (Copilot review on #1156): the +/// directory-level `reject_symlink` check leaves a TOCTOU window where a +/// symlink swapped in between create-dir and file-open would still +/// redirect writes. On POSIX we close that window with `O_NOFOLLOW`, +/// which fails the open itself if `path` is a symlink. On Windows we +/// fall back to a `symlink_metadata` pre-check (narrower than no check +/// at all). +fn safe_open_for_write(path: &Path, append: bool) -> anyhow::Result { + #[cfg(unix)] + { + use std::os::unix::fs::OpenOptionsExt; + let mut opts = std::fs::OpenOptions::new(); + opts.write(true).create(true); + if append { + opts.append(true); + } else { + opts.truncate(true); + } + opts.custom_flags(libc::O_NOFOLLOW); + match opts.open(path) { + Ok(f) => Ok(f), + Err(e) => { + // ELOOP: the target was a symlink (O_NOFOLLOW). + if e.raw_os_error() == Some(libc::ELOOP) { + anyhow::bail!("refusing to write: {} is a symbolic link.", path.display()); + } + Err(e.into()) + } + } + } + #[cfg(not(unix))] + { + if std::fs::symlink_metadata(path) + .map(|m| m.file_type().is_symlink()) + .unwrap_or(false) + { + anyhow::bail!("refusing to write: {} is a symbolic link.", path.display()); + } + let mut opts = std::fs::OpenOptions::new(); + opts.write(true).create(true); + if append { + opts.append(true); + } else { + opts.truncate(true); + } + Ok(opts.open(path)?) + } +} + pub struct ExportStats { pub wings: usize, pub rooms: usize, @@ -115,10 +167,7 @@ pub fn export_palace(palace_path: Option<&Path>, output_dir: &Path) -> anyhow::R let key = format!("{}|{}", wing, room); let is_new = !opened_rooms.contains_key(&key); - let mut file = std::fs::OpenOptions::new() - .create(true) - .append(true) - .open(&room_file)?; + let mut file = safe_open_for_write(&room_file, true)?; use std::io::Write; if is_new { writeln!(file, "# {} / {}\n", wing, room)?; @@ -185,7 +234,11 @@ pub fn export_palace(palace_path: Option<&Path>, output_dir: &Path) -> anyhow::R )); } - std::fs::write(&index_path, index_lines.join("\n"))?; + { + use std::io::Write; + let mut f = safe_open_for_write(&index_path, false)?; + f.write_all(index_lines.join("\n").as_bytes())?; + } println!( "\n Exported {} drawers across {} wings, {} rooms", @@ -271,4 +324,60 @@ mod tests { assert!(msg.contains("symbolic link"), "unexpected error: {msg}"); assert!(msg.contains("output_dir"), "unexpected error: {msg}"); } + + #[test] + fn test_safe_open_for_write_allows_regular_file() { + // Sanity: a plain (non-existent) path opens fine. + let temp = tempfile::TempDir::new().unwrap(); + let path = temp.path().join("regular.md"); + let mut f = safe_open_for_write(&path, false).expect("regular path should open"); + use std::io::Write; + f.write_all(b"hello").unwrap(); + drop(f); + assert_eq!(std::fs::read_to_string(&path).unwrap(), "hello"); + } + + #[cfg(unix)] + #[test] + fn test_safe_open_for_write_blocks_symlinked_file() { + // Mirrors upstream mempalace 7545238: the per-file open must refuse + // to follow a symlink at the target path, closing the TOCTOU window + // that the directory-level reject_symlink check leaves open. + let temp = tempfile::TempDir::new().unwrap(); + let real_target = temp.path().join("real_target.md"); + // The target need not exist — O_NOFOLLOW fails on the symlink + // itself before resolution. + let link = temp.path().join("link.md"); + std::os::unix::fs::symlink(&real_target, &link).unwrap(); + + let err = safe_open_for_write(&link, false) + .expect_err("safe_open_for_write must refuse a symlinked target"); + let msg = format!("{}", err); + assert!(msg.contains("symbolic link"), "unexpected error: {msg}"); + + // The symlink target must not have been created behind us. + assert!( + !real_target.exists(), + "symlink target should not have been created" + ); + } + + #[cfg(unix)] + #[test] + fn test_safe_open_for_write_appends_to_regular_file() { + // The append=true variant used for room files preserves prior + // content (each drawer in a room writes a new section). + let temp = tempfile::TempDir::new().unwrap(); + let path = temp.path().join("room.md"); + std::fs::write(&path, "original\n").unwrap(); + { + let mut f = safe_open_for_write(&path, true).expect("append open should succeed"); + use std::io::Write; + f.write_all(b"appended\n").unwrap(); + } + assert_eq!( + std::fs::read_to_string(&path).unwrap(), + "original\nappended\n" + ); + } } diff --git a/crates/core/src/miner.rs b/crates/core/src/miner.rs index 321e23d..10e18a4 100644 --- a/crates/core/src/miner.rs +++ b/crates/core/src/miner.rs @@ -116,8 +116,21 @@ static SKIP_FILES: &[&str] = &[ "mempal.yml", ".gitignore", "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", ]; +/// Cap on chunks produced from a single file. +/// +/// A file producing more than this is almost always a generated artifact +/// (CSV/JSON dump, lockfile not in `SKIP_FILES`, etc.). Embedding +/// thousands of chunks from one file in one batch has triggered ONNX +/// runtime `bad allocation` errors on Windows (upstream mempalace +/// `5488e7b`, #1296). The cap is conservative: a 500-chunk file at +/// `CHUNK_SIZE` (800 chars) is ~400 KB of source, which covers most +/// legitimate hand-written content while bounding the worst-case batch. +const MAX_CHUNKS_PER_FILE: usize = 500; + #[derive(Debug, Clone)] struct GitignoreRule { pattern: String, @@ -709,6 +722,24 @@ impl Miner { return Ok(0); } + if chunks.len() > MAX_CHUNKS_PER_FILE { + // Catches the broader class of generated artifacts (CSV/JSON + // dumps, build outputs, lockfiles not yet in `SKIP_FILES`) + // that the named-file list will never fully cover. + let display = filepath + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| source_file.clone()); + let truncated: String = display.chars().take(50).collect(); + println!( + " ! [skip] {:<50} produced {} chunks (> {}); add to SKIP_FILES or .gitignore", + truncated, + chunks.len(), + MAX_CHUNKS_PER_FILE + ); + return Ok(0); + } + let chunks_added = chunks.len(); // Batch insert all chunks for this file in a single call @@ -1283,4 +1314,64 @@ mod tests { assert!(metadata.get("filed_at").is_some()); assert!(metadata.get("source_mtime").is_some()); } + + #[test] + fn test_skip_files_includes_lockfiles() { + // Mirrors upstream mempalace 5488e7b: pnpm/yarn lockfiles + // should be skipped just like package-lock.json. + for name in ["package-lock.json", "pnpm-lock.yaml", "yarn.lock"] { + assert!( + SKIP_FILES.contains(&name), + "expected SKIP_FILES to include {name}" + ); + } + } + + #[tokio::test] + async fn test_mine_file_skips_when_chunks_exceed_cap() { + // A file that would produce > MAX_CHUNKS_PER_FILE chunks should be + // skipped with a warning instead of triggering a worst-case batch + // through the embedder. Mirrors upstream mempalace 5488e7b (#1296). + let temp = tempfile::TempDir::new().unwrap(); + let palace = temp.path().join("palace"); + let file = temp.path().join("generated.csv"); + + // Build a payload with enough non-overlapping chunks to exceed the + // cap. Each chunk needs >= MIN_CHUNK_SIZE chars and the chunker + // splits on `\n\n`, so we emit (MAX_CHUNKS_PER_FILE + 50) blocks + // separated by blank lines. + let block = "lorem ipsum dolor sit amet consectetur adipiscing elit "; + let mut payload = String::new(); + for _ in 0..(MAX_CHUNKS_PER_FILE + 50) { + payload.push_str(&block.repeat(20)); + payload.push_str("\n\n"); + } + std::fs::write(&file, &payload).unwrap(); + + let rooms = vec![RoomMapping { + name: "general".to_string(), + description: "General".to_string(), + keywords: vec![], + }]; + let mut miner = Miner::new(&palace, "wing", rooms).unwrap(); + + // Sanity: the chunker really would emit > MAX_CHUNKS_PER_FILE + // before the cap kicks in. + let raw_chunks = miner.chunk_text(payload.trim(), "generated.csv"); + assert!( + raw_chunks.len() > MAX_CHUNKS_PER_FILE, + "fixture should exceed cap; produced {}", + raw_chunks.len() + ); + + let added = miner.mine_file(&file).await.unwrap(); + assert_eq!( + added, 0, + "files exceeding chunk cap should not be filed (got {added})" + ); + + // No drawers should have been added to the palace. + miner.palace_db.flush().unwrap(); + assert_eq!(miner.palace_db.count(), 0); + } } From b4087d2b238891422987c27d6e2c9abcbc482c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= <91641923+quangdang46@users.noreply.github.com> Date: Sat, 9 May 2026 22:38:18 +0700 Subject: [PATCH 2/2] test(diary): canonicalize diary_dir before computing state_file_for MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ingest_diaries canonicalizes diary_dir internally (resolve_path). On macOS /tmp -> /private/tmp and on Windows the verbatim UNC form, both differ from the raw temp.path().join('diaries') used in the regression test — so state_file_for produced a different SHA256 key and read_to_string returned ENOENT. Look the state file up via the same canonical form ingest_diaries actually used. Linux /tmp didn't symlink, which is why local CI passed but macOS/Windows didn't. Test result: 386/386 passing locally on Linux; macOS/Windows expected to pass on next CI run. --- crates/core/src/diary_ingest.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/core/src/diary_ingest.rs b/crates/core/src/diary_ingest.rs index 46c4bb6..ba4bb51 100644 --- a/crates/core/src/diary_ingest.rs +++ b/crates/core/src/diary_ingest.rs @@ -445,8 +445,16 @@ mod tests { let stats = ingest_diaries(&diary_dir, Some(&palace_path), "diary", false).unwrap(); assert_eq!(stats.days_updated, 1); + // ingest_diaries canonicalizes diary_dir internally (`resolve_path`). + // On macOS `/tmp` symlinks to `/private/tmp`, which changes the + // hashed state-file key — look up the state file via the same + // canonical form ingest_diaries actually used. + let canonical_diary_dir = diary_dir + .canonicalize() + .unwrap_or_else(|_| diary_dir.clone()); + // Simulate a legacy state file: load, strip content_hash, rewrite. - let state_path = state_file_for(&palace_path, &diary_dir).unwrap(); + let state_path = state_file_for(&palace_path, &canonical_diary_dir).unwrap(); let mut raw_state: HashMap = serde_json::from_str(&std::fs::read_to_string(&state_path).unwrap()).unwrap(); for entry in raw_state.values_mut() {