From c6d01cf1503af165bc3369dc504a391f577d3706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= Date: Sun, 17 May 2026 10:14:15 +0000 Subject: [PATCH 1/2] fix: port upstream symlink logging and MCP unknown-param error (#1462, #1512) --- crates/core/src/convo_miner.rs | 62 +++++++++++- crates/core/src/mcp_server.rs | 173 ++++++++++++++++++++++++++++++++- crates/core/src/miner.rs | 98 +++++++++++++++++++ port.txt | 67 +++++++++++++ 4 files changed, 396 insertions(+), 4 deletions(-) diff --git a/crates/core/src/convo_miner.rs b/crates/core/src/convo_miner.rs index 5c40dcc..d36b03a 100644 --- a/crates/core/src/convo_miner.rs +++ b/crates/core/src/convo_miner.rs @@ -3,6 +3,7 @@ use crate::normalize::normalize; use crate::palace_db::PalaceDb; use chrono::Utc; use sha2::{Digest, Sha256}; +use std::io::Write; use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -426,6 +427,12 @@ fn generate_drawer_id(wing: &str, room: &str, source_file: &str, chunk_index: us format!("drawer_{}_{}_{}", wing, room, &hex[..24]) } +/// Scan `convo_dir` for conversation files, returning paths to mine. +/// +/// Skips symlinks (which could otherwise follow links to recursive structures +/// or `/dev/urandom`) and oversized files. Each skipped symlink is logged to +/// `stderr` with a `" SKIP: (symlink)"` line so callers can +/// tell why a directory looks empty after walking (#1462). fn scan_convos(convo_dir: &Path) -> Vec { let mut files = Vec::new(); for entry in WalkDir::new(convo_dir) @@ -445,7 +452,19 @@ fn scan_convos(convo_dir: &Path) -> Vec { .file_name() .and_then(|n| n.to_str()) .unwrap_or_default(); - if name.ends_with(".meta.json") || path.is_symlink() { + if name.ends_with(".meta.json") { + continue; + } + // Skip symlinks — prevents following recursive/bogus links. Log to + // stderr with the path relative to the scan root so the diagnostic + // is unambiguous and renders with forward slashes on every platform. + // Mirrors upstream Python `scan_convos` post-#1462. (#1462) + if path.is_symlink() { + let rel = path + .strip_prefix(convo_dir) + .map(|p| p.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| path.to_string_lossy().to_string()); + let _ = writeln!(std::io::stderr(), " SKIP: {rel} (symlink)"); continue; } let extension = path @@ -671,6 +690,47 @@ mod tests { assert!(!names.contains(&"config.txt".to_string())); } + #[cfg(unix)] + #[test] + fn test_scan_convos_skips_symlinks() { + // Regression for upstream #1462: scan_convos drops symlinked files + // so the walker can't recurse into bogus link targets. The stderr + // SKIP log surfaces the skip with a path relative to the scan root. + let temp = tempfile::TempDir::new().unwrap(); + let real = temp.path().join("real.md"); + std::fs::write(&real, "hello world").unwrap(); + std::os::unix::fs::symlink(&real, temp.path().join("link.md")).unwrap(); + + let files = scan_convos(temp.path()); + let names: Vec = files + .iter() + .map(|path| path.file_name().unwrap().to_string_lossy().to_string()) + .collect(); + assert_eq!(names, vec!["real.md".to_string()]); + } + + #[cfg(unix)] + #[test] + fn test_scan_convos_skips_dangling_symlinks() { + // A dangling symlink in the convo dir must not panic the walker nor + // surface in the result set. Mirrors upstream coverage for #1462's + // polished dangling-link path. + let temp = tempfile::TempDir::new().unwrap(); + std::fs::write(temp.path().join("real.md"), "hello world").unwrap(); + std::os::unix::fs::symlink( + temp.path().join("missing.md"), + temp.path().join("dangling.md"), + ) + .unwrap(); + + let files = scan_convos(temp.path()); + let names: Vec = files + .iter() + .map(|path| path.file_name().unwrap().to_string_lossy().to_string()) + .collect(); + assert_eq!(names, vec!["real.md".to_string()]); + } + #[test] fn test_scan_convos_skips_python_parity_dirs() { let temp = tempfile::TempDir::new().unwrap(); diff --git a/crates/core/src/mcp_server.rs b/crates/core/src/mcp_server.rs index 6b468ea..532bcbf 100644 --- a/crates/core/src/mcp_server.rs +++ b/crates/core/src/mcp_server.rs @@ -3,12 +3,12 @@ //! Exposes MemPalace functionality as MCP tools via stdio transport. //! Read-only mode restricts mutations (diary_write, config_write, people_write). -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs; use std::io::Write; use std::path::PathBuf; use std::pin::Pin; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use rmcp::model::{ CallToolResult, Content, Implementation, InitializeResult, JsonObject, ListToolsResult, @@ -209,7 +209,10 @@ where 16, ); log_tool_invocation(&tool_name, &args, None, &trace_id); - let result = dispatch(tool_name.clone(), args.clone()).await; + let result = match validate_known_params(&tool_name, &args) { + Err(err) => Err(err), + Ok(()) => dispatch(tool_name.clone(), args.clone()).await, + }; log_tool_invocation( &tool_name, &args, @@ -219,6 +222,78 @@ where result } +/// Tools whose Input struct accepts arbitrary extra fields (via +/// `#[serde(flatten)] custom_metadata: Option` or +/// equivalent). These bypass the unknown-parameter check so callers can keep +/// supplying custom metadata keys — mirrors Python's `accepts_var_keyword` +/// gate on `**kwargs` handlers. (#1512) +const TOOLS_ACCEPTING_EXTRAS: &[&str] = &["mempalace_add_drawer"]; + +/// Keys that are internal transport metadata and live in no tool schema. They +/// are stripped before dispatch elsewhere; flagging them as unknown here would +/// surface a misleading error for legitimate transport-level options. (#1512) +const TRANSPORT_RESERVED_KEYS: &[&str] = &["wait_for_previous"]; + +/// Lazily-built lookup of `tool_name -> declared input-schema property names`. +/// Source of truth is `make_tools()`'s JSON schema, so adding a property in +/// one place automatically updates the unknown-parameter check. +fn tool_schema_props() -> &'static HashMap> { + static SCHEMA_PROPS: OnceLock>> = OnceLock::new(); + SCHEMA_PROPS.get_or_init(|| { + let mut by_tool = HashMap::new(); + for tool in make_tools() { + let value = serde_json::Value::Object((*tool.input_schema).clone()); + let props = value + .get("properties") + .and_then(|p| p.as_object()) + .map(|obj| obj.keys().cloned().collect::>()) + .unwrap_or_default(); + by_tool.insert(tool.name.to_string(), props); + } + by_tool + }) +} + +/// Reject unknown parameter *names* with JSON-RPC -32602 instead of letting +/// serde silently drop them and resurfacing the typo as a downstream +/// "missing required" error. Skips tools whose Input struct uses +/// `#[serde(flatten)]` extras (`TOOLS_ACCEPTING_EXTRAS`) and the +/// `wait_for_previous` transport kwarg, matching upstream Python's +/// `accepts_var_keyword` gate. (#1512) +fn validate_known_params(tool_name: &str, args: &JsonObject) -> Result<(), ErrorData> { + if TOOLS_ACCEPTING_EXTRAS.contains(&tool_name) { + return Ok(()); + } + let Some(allowed) = tool_schema_props().get(tool_name) else { + // Unknown tool — let dispatch surface method_not_found. + return Ok(()); + }; + let mut unknown: Vec<&str> = args + .keys() + .filter(|k| !allowed.contains(k.as_str())) + .filter(|k| !TRANSPORT_RESERVED_KEYS.contains(&k.as_str())) + .map(String::as_str) + .collect(); + if unknown.is_empty() { + return Ok(()); + } + unknown.sort_unstable(); + let quoted = unknown + .iter() + .map(|k| format!("'{k}'")) + .collect::>() + .join(", "); + let word = if unknown.len() == 1 { + "parameter" + } else { + "parameters" + }; + Err(ErrorData::invalid_params( + format!("Unknown {word} {quoted} for tool {tool_name}"), + None, + )) +} + fn make_dispatch(state: Arc) -> impl Fn(String, JsonObject) -> DynResult { move |name, args| { let state = state.clone(); @@ -2350,4 +2425,96 @@ mod tests { ); assert!(result.is_ok(), "traverse failed: {:?}", result); } + + // --------------------------------------------------------------------- + // Unknown parameter name (#1512) + // + // A kwarg not in the tool schema (wrong parameter *name*, e.g. `text=` + // instead of `content=`) should surface as JSON-RPC -32602 naming the + // offending kwarg, instead of being silently dropped by serde and + // resurfacing indirectly as a later "Missing required 'X'". Symmetric + // with the missing-required-shape path. The internal `wait_for_previous` + // transport kwarg must never be flagged, and handlers whose Input struct + // uses `#[serde(flatten)]` extras (`mempalace_add_drawer`) must keep + // accepting unknown kwargs. + // --------------------------------------------------------------------- + + #[test] + fn test_unknown_param_returns_invalid_params_for_wrong_kwarg_name() { + let state = test_state(); + let err = dispatch( + &state, + "mempalace_search", + json!({ "query": "hello", "txt": "oops" }), + ) + .expect_err("unknown 'txt' should surface as an error"); + assert_eq!(err.code.0, -32602); + let message = err.message.as_ref(); + assert!(message.contains("'txt'"), "message: {message}"); + assert!(message.contains("Unknown parameter"), "message: {message}"); + assert!(message.contains("mempalace_search"), "message: {message}"); + // Names the actual wrong kwarg, not the indirect missing-required symptom. + assert!(!message.contains("Missing required"), "message: {message}"); + } + + #[test] + fn test_two_unknown_params_list_both_names() { + let state = test_state(); + let err = dispatch( + &state, + "mempalace_search", + json!({ "query": "hello", "txt": "a", "bogus": "b" }), + ) + .expect_err("multiple unknown params should error"); + assert_eq!(err.code.0, -32602); + let message = err.message.as_ref(); + assert!(message.contains("parameters"), "message: {message}"); + assert!(message.contains("'txt'"), "message: {message}"); + assert!(message.contains("'bogus'"), "message: {message}"); + } + + #[test] + fn test_wait_for_previous_not_flagged_as_unknown() { + // `wait_for_previous` is an internal transport kwarg in no tool + // schema; it must not trip the unknown-param check. + let state = test_state(); + let result = dispatch( + &state, + "mempalace_diary_write", + json!({ + "agent_name": "x", + "entry": "y", + "wait_for_previous": true, + }), + ); + assert!( + result.is_ok(), + "wait_for_previous should pass through: {:?}", + result + ); + } + + #[test] + fn test_add_drawer_accepts_unknown_custom_metadata_keys() { + // `mempalace_add_drawer` uses `#[serde(flatten)] custom_metadata`, + // so callers may supply arbitrary string-valued metadata keys — + // those must not be rejected as unknown parameters. + let state = test_state(); + let result = dispatch( + &state, + "mempalace_add_drawer", + json!({ + "wing": "test", + "room": "decisions", + "content": "we picked sqlite", + "priority": "high", + "status": "open", + }), + ); + assert!( + result.is_ok(), + "add_drawer custom metadata should pass through: {:?}", + result + ); + } } diff --git a/crates/core/src/miner.rs b/crates/core/src/miner.rs index 10e18a4..6569c95 100644 --- a/crates/core/src/miner.rs +++ b/crates/core/src/miner.rs @@ -5,6 +5,7 @@ use chrono::Utc; use regex::Regex; use sha2::{Digest, Sha256}; use std::collections::{HashMap, HashSet}; +use std::io::Write; use std::path::Path; use walkdir::WalkDir; @@ -414,6 +415,12 @@ fn is_force_included(path: &Path, project_path: &Path, include_paths: &HashSet (symlink)"` line so +/// callers can tell why a directory looks empty after walking (#1462). pub fn scan_project( project_dir: &Path, respect_gitignore: bool, @@ -497,7 +504,18 @@ pub fn scan_project( continue; } + // Skip symlinks — prevents following links to /dev/urandom, recursive + // structures, etc. Log to stderr with the path relative to the scan + // root so a nested symlink in a deep subdirectory is unambiguous and + // the log renders with forward slashes on every platform. stdout stays + // clean for "Files: N" / "Drawers filed: N" markers callers parse. + // Mirrors upstream Python `scan_project` post-#1462. (#1462) if path.is_symlink() { + let rel = path + .strip_prefix(&project_path) + .map(|p| p.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| path.to_string_lossy().to_string()); + let _ = writeln!(std::io::stderr(), " SKIP: {rel} (symlink)"); continue; } @@ -1073,6 +1091,86 @@ mod tests { assert_eq!(room, "frontend"); } + #[cfg(unix)] + #[test] + fn test_scan_project_skips_symlinks() { + // Regression for upstream #1462: symlinks are skipped so the walker + // never follows links to /dev/urandom, recursive directories, or + // resources outside the scan root. The "Files: 0" outcome surfaces + // as a stderr SKIP log so callers can distinguish "no files" from + // "all the files were symlinks". + let temp = tempfile::TempDir::new().unwrap(); + let root = temp.path(); + let canonical_root = root.canonicalize().unwrap(); + let real = root.join("real.py"); + std::fs::write(&real, "print('real')\n".repeat(20)).unwrap(); + std::os::unix::fs::symlink(&real, root.join("link.py")).unwrap(); + + let files = scan_project(root, false, None); + let rel: Vec = files + .iter() + .map(|p| { + p.strip_prefix(&canonical_root) + .unwrap() + .to_string_lossy() + .replace('\\', "/") + }) + .collect(); + assert_eq!(rel, vec!["real.py"]); + } + + #[cfg(unix)] + #[test] + fn test_scan_project_skips_nested_symlinks() { + // Nested symlinks (e.g. inside a subdirectory) are reported with the + // path relative to the scan root, not the leaf filename, so they can + // be located unambiguously. See upstream `d7d9604` polish to #1462. + let temp = tempfile::TempDir::new().unwrap(); + let root = temp.path(); + let canonical_root = root.canonicalize().unwrap(); + std::fs::create_dir_all(root.join("a/b")).unwrap(); + let real = root.join("a/b/real.py"); + std::fs::write(&real, "print('real')\n".repeat(20)).unwrap(); + std::os::unix::fs::symlink(&real, root.join("a/b/link.py")).unwrap(); + + let files = scan_project(root, false, None); + let rel: Vec = files + .iter() + .map(|p| { + p.strip_prefix(&canonical_root) + .unwrap() + .to_string_lossy() + .replace('\\', "/") + }) + .collect(); + assert_eq!(rel, vec!["a/b/real.py"]); + } + + #[cfg(unix)] + #[test] + fn test_scan_project_skips_dangling_symlinks() { + // A dangling symlink (target does not exist) must not panic the + // walker nor surface in the result set. Mirrors upstream coverage + // for the polished #1462 case. + let temp = tempfile::TempDir::new().unwrap(); + let root = temp.path(); + let canonical_root = root.canonicalize().unwrap(); + std::fs::write(root.join("real.py"), "print('real')\n".repeat(20)).unwrap(); + std::os::unix::fs::symlink(root.join("missing.py"), root.join("dangling.py")).unwrap(); + + let files = scan_project(root, false, None); + let rel: Vec = files + .iter() + .map(|p| { + p.strip_prefix(&canonical_root) + .unwrap() + .to_string_lossy() + .replace('\\', "/") + }) + .collect(); + assert_eq!(rel, vec!["real.py"]); + } + #[test] fn test_scan_project_respects_gitignore() { let temp = tempfile::TempDir::new().unwrap(); diff --git a/port.txt b/port.txt index f452f16..87e48d0 100644 --- a/port.txt +++ b/port.txt @@ -214,3 +214,70 @@ prune, LLM init, palace-graph cache, hook/doc/i18n sync). No new gaps were introduced by this PR. This file is the current gap ledger for the `68319dc` reference state. + + +Sync update — upstream develop `bb31396` (2026-05-15) +===================================================== + +Followed `.devin/skills/sync-upstream-mempalace/SKILL.md`. Diffed upstream +`68319dc..bb31396` (11 non-merge commits touching `mempalace/`) and ported +the two highest-priority upstream fixes: a CLI-ergonomics diagnostic for +silently-skipped symlinks during mining and a structured MCP error for +unknown parameter names. Chunk-size configuration plumbing (#1024, #1410, +#1519, plus the `342cfa6`/`bea7cb1` follow-ups) is feature-flag work and +is deferred to a follow-up sync. + +Ported in this session +---------------------- + +1. `mempalace/miner.py` `becf561` + `d7d9604` (#1462) — `mine_project` and + `mine_conversations` used to silently skip symlinks, so a directory full + of symlinked files surfaced only as a baffling `Files: 0` line with no + indication why. Emit a ` SKIP: (symlink)` diagnostic to + `stderr` (stdout stays clean for callers parsing the `Files:` / + `Drawers filed:` markers). The polished path uses the scan-root-relative + posix-style path so nested symlinks remain unambiguous. + - Rust: `crates/core/src/miner.rs::scan_project` and + `crates/core/src/convo_miner.rs::scan_convos`. Regression tests + `miner::tests::test_scan_project_skips_symlinks`, + `test_scan_project_skips_nested_symlinks`, + `test_scan_project_skips_dangling_symlinks`, + `convo_miner::tests::test_scan_convos_skips_symlinks`, and + `test_scan_convos_skips_dangling_symlinks` (`#[cfg(unix)]`-gated since + the Windows symlink API requires admin privileges). + +2. `mempalace/mcp_server.py` `51c03a0` (#1512) — unknown kwargs at the MCP + dispatch layer (a typo like `text=` instead of `content=`) used to be + silently dropped by the schema filter, only resurfacing indirectly as a + later "Missing required 'X'". Emit `-32602` naming the offending kwarg + instead, symmetric with the missing-required path. Gated by an + `accepts_var_keyword` equivalent (`TOOLS_ACCEPTING_EXTRAS` — currently + `mempalace_add_drawer`, whose Input uses `#[serde(flatten)]` for custom + metadata), and skips the internal `wait_for_previous` transport kwarg. + - Rust: `crates/core/src/mcp_server.rs::validate_known_params`, wired + into `invoke_with_wal`. Tool-name → allowed-property lookup is cached + once via `OnceLock` from `make_tools()`'s JSON schema so the schema + stays the single source of truth. Regression tests + `mcp_server::tests::test_unknown_param_returns_invalid_params_for_wrong_kwarg_name`, + `test_two_unknown_params_list_both_names`, + `test_wait_for_previous_not_flagged_as_unknown`, and + `test_add_drawer_accepts_unknown_custom_metadata_keys`. + +Remaining gaps after this sync +------------------------------ + +Carried forward unchanged: gitignore-aware drawer prune, LLM-assisted +init / CLI flags, palace-graph cache invalidation, hook/doc/i18n sync, +broader test parity for upstream-added modules. + +New deferred work (not ported here): + +1. Configurable chunk sizes (`1cc2876` feat (#1024), `b1d75b3` / + `fd63703` follow-ups (#1410, #1519), `bea7cb1` / `342cfa6` validation + fixes). Upstream wires `--chunk-size`/`--chunk-overlap` through CLI → + config → `miner` / `convo_miner`. Touches non-trivial config plumbing + and is not yet user-requested in the Rust port. + +2. CHANGELOG / docs polish around the same chunk-size feature (`68dc1bf`). + +This file is the current gap ledger for the `bb31396` reference state. From 6fd459d9f72f4cb1e70311b6642e406314151f50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= Date: Sun, 17 May 2026 12:07:51 +0000 Subject: [PATCH 2/2] fix(miner): make symlink SKIP diagnostic actually fire (#1462) End-to-end testing of the initial port surfaced that the new symlink branch in both scan_project and scan_convos was dead code: walkdir under follow_links(false) reports symlinks-to-files with file_type().is_file() == false, and both functions had an early `if !entry.file_type().is_file() { continue; }` filter that ran before the diagnostic branch. The 5 regression tests still passed because they only asserted the symlink was absent from the result set -- which the prior filter already guaranteed. Refactor scan_project and scan_convos to delegate to internal scan_project_with_log / scan_convos_with_log helpers that accept an arbitrary writer, so the diagnostic is now observable from unit tests without spawning a subprocess. The public wrappers still write to stderr. Change the early filter to let symlinks through alongside regular files, and use entry.file_type().is_symlink() in the diagnostic branch (uses walkdir's cached lstat, no extra syscall). For scan_convos, also reorder so the extension filter runs before the symlink-log branch, matching upstream Python ordering -- a .png symlink is silently dropped at the extension gate rather than logged. Strengthen the 5 #[cfg(unix)] regression tests to capture the writer buffer and assert the exact " SKIP: (symlink)" line fires (would-have caught the dead-code bug). Add 2 negative-control tests: test_scan_project_emits_no_skip_lines_when_no_symlinks_present locks in that a clean scan stays silent, and test_scan_convos_does_not_log_extension_filtered_symlinks locks in the upstream extension-before-symlink ordering. Verified end-to-end against the release binary on a temp project with 3 symlinks (top-level, nested two-deep, dangling): all 3 SKIP lines appear on stderr with forward-slash, scan-root-relative paths. --- crates/core/src/convo_miner.rs | 84 +++++++++++++++++++++++++------ crates/core/src/miner.rs | 92 +++++++++++++++++++++++++++++----- 2 files changed, 148 insertions(+), 28 deletions(-) diff --git a/crates/core/src/convo_miner.rs b/crates/core/src/convo_miner.rs index d36b03a..ae78499 100644 --- a/crates/core/src/convo_miner.rs +++ b/crates/core/src/convo_miner.rs @@ -434,6 +434,13 @@ fn generate_drawer_id(wing: &str, room: &str, source_file: &str, chunk_index: us /// `stderr` with a `" SKIP: (symlink)"` line so callers can /// tell why a directory looks empty after walking (#1462). fn scan_convos(convo_dir: &Path) -> Vec { + scan_convos_with_log(convo_dir, &mut std::io::stderr()) +} + +/// Same as [`scan_convos`] but routes the skipped-symlink diagnostic to an +/// arbitrary writer. Lets unit tests assert the log fires without having to +/// fork a subprocess to capture stderr. +fn scan_convos_with_log(convo_dir: &Path, skip_log: &mut W) -> Vec { let mut files = Vec::new(); for entry in WalkDir::new(convo_dir) .follow_links(false) @@ -444,7 +451,12 @@ fn scan_convos(convo_dir: &Path) -> Vec { }) .filter_map(|entry| entry.ok()) { - if !entry.file_type().is_file() { + let ft = entry.file_type(); + // Let regular files AND symlinks through. Walkdir's `is_file()` + // returns `false` for symlinks-to-files under `follow_links(false)`, + // so a bare `!is_file()` check would silently drop every symlink + // before the diagnostic branch below can fire. + if !ft.is_file() && !ft.is_symlink() { continue; } let path = entry.path(); @@ -455,18 +467,6 @@ fn scan_convos(convo_dir: &Path) -> Vec { if name.ends_with(".meta.json") { continue; } - // Skip symlinks — prevents following recursive/bogus links. Log to - // stderr with the path relative to the scan root so the diagnostic - // is unambiguous and renders with forward slashes on every platform. - // Mirrors upstream Python `scan_convos` post-#1462. (#1462) - if path.is_symlink() { - let rel = path - .strip_prefix(convo_dir) - .map(|p| p.to_string_lossy().replace('\\', "/")) - .unwrap_or_else(|_| path.to_string_lossy().to_string()); - let _ = writeln!(std::io::stderr(), " SKIP: {rel} (symlink)"); - continue; - } let extension = path .extension() .and_then(|ext| ext.to_str()) @@ -475,6 +475,20 @@ fn scan_convos(convo_dir: &Path) -> Vec { if !CONVO_EXTENSIONS.contains(&extension.as_str()) { continue; } + // Skip symlinks — prevents following recursive/bogus links. Log to + // `skip_log` with the path relative to the scan root so the + // diagnostic is unambiguous and renders with forward slashes on + // every platform. Runs AFTER the extension filter to match upstream + // Python `scan_convos` ordering — a `.png` symlink is silently + // dropped at the extension gate rather than logged. (#1462) + if ft.is_symlink() { + let rel = path + .strip_prefix(convo_dir) + .map(|p| p.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| path.to_string_lossy().to_string()); + let _ = writeln!(skip_log, " SKIP: {rel} (symlink)"); + continue; + } let Ok(metadata) = path.metadata() else { continue; }; @@ -696,17 +710,26 @@ mod tests { // Regression for upstream #1462: scan_convos drops symlinked files // so the walker can't recurse into bogus link targets. The stderr // SKIP log surfaces the skip with a path relative to the scan root. + // Asserts the diagnostic actually fires — relying on result-set + // exclusion alone passes against dead-code symlink branches, which + // is how the initial port shipped. let temp = tempfile::TempDir::new().unwrap(); let real = temp.path().join("real.md"); std::fs::write(&real, "hello world").unwrap(); std::os::unix::fs::symlink(&real, temp.path().join("link.md")).unwrap(); - let files = scan_convos(temp.path()); + let mut log = Vec::new(); + let files = scan_convos_with_log(temp.path(), &mut log); let names: Vec = files .iter() .map(|path| path.file_name().unwrap().to_string_lossy().to_string()) .collect(); assert_eq!(names, vec!["real.md".to_string()]); + let log = String::from_utf8(log).unwrap(); + assert!( + log.contains(" SKIP: link.md (symlink)\n"), + "expected SKIP diagnostic for link.md, got: {log:?}" + ); } #[cfg(unix)] @@ -723,12 +746,43 @@ mod tests { ) .unwrap(); - let files = scan_convos(temp.path()); + let mut log = Vec::new(); + let files = scan_convos_with_log(temp.path(), &mut log); + let names: Vec = files + .iter() + .map(|path| path.file_name().unwrap().to_string_lossy().to_string()) + .collect(); + assert_eq!(names, vec!["real.md".to_string()]); + let log = String::from_utf8(log).unwrap(); + assert!( + log.contains(" SKIP: dangling.md (symlink)\n"), + "expected SKIP diagnostic for dangling.md, got: {log:?}" + ); + } + + #[cfg(unix)] + #[test] + fn test_scan_convos_does_not_log_extension_filtered_symlinks() { + // A symlink whose name doesn't match `CONVO_EXTENSIONS` is silently + // dropped at the extension gate — it must NOT surface in the SKIP + // log, matching upstream Python ordering (extension → symlink-log + // → size). + let temp = tempfile::TempDir::new().unwrap(); + std::fs::write(temp.path().join("real.md"), "hello world").unwrap(); + std::os::unix::fs::symlink("real.md", temp.path().join("link.png")).unwrap(); + + let mut log = Vec::new(); + let files = scan_convos_with_log(temp.path(), &mut log); let names: Vec = files .iter() .map(|path| path.file_name().unwrap().to_string_lossy().to_string()) .collect(); assert_eq!(names, vec!["real.md".to_string()]); + let log = String::from_utf8(log).unwrap(); + assert!( + !log.contains("link.png"), + "extension-filtered symlink leaked into SKIP log: {log:?}" + ); } #[test] diff --git a/crates/core/src/miner.rs b/crates/core/src/miner.rs index 6569c95..78c013a 100644 --- a/crates/core/src/miner.rs +++ b/crates/core/src/miner.rs @@ -421,10 +421,31 @@ fn is_force_included(path: &Path, project_path: &Path, include_paths: &HashSet (symlink)"` line so /// callers can tell why a directory looks empty after walking (#1462). +/// Walk `project_dir` and return readable file paths, logging each +/// skipped symlink to `stderr` with a `" SKIP: (symlink)"` +/// line so callers can tell why a directory looks empty after walking. +/// Mirrors upstream Python `scan_project` post-#1462. (#1462) pub fn scan_project( project_dir: &Path, respect_gitignore: bool, include_ignored: Option<&[String]>, +) -> Vec { + scan_project_with_log( + project_dir, + respect_gitignore, + include_ignored, + &mut std::io::stderr(), + ) +} + +/// Same as [`scan_project`] but routes the skipped-symlink diagnostic to an +/// arbitrary writer. Lets unit tests assert the log fires without having to +/// fork a subprocess to capture stderr. +fn scan_project_with_log( + project_dir: &Path, + respect_gitignore: bool, + include_ignored: Option<&[String]>, + skip_log: &mut W, ) -> Vec { let project_path = match project_dir.canonicalize() { Ok(path) => path, @@ -442,8 +463,9 @@ pub fn scan_project( .filter_map(|e| e.ok()) { let path = entry.path().to_path_buf(); + let ft = entry.file_type(); - if entry.file_type().is_dir() { + if ft.is_dir() { if respect_gitignore { active_matchers.retain(|matcher| { path == matcher.base_dir || path.starts_with(&matcher.base_dir) @@ -455,7 +477,12 @@ pub fn scan_project( continue; } - if !entry.file_type().is_file() { + // Let regular files AND symlinks through. Walkdir's `is_file()` + // returns `false` for symlinks-to-files under `follow_links(false)`, + // so a bare `!is_file()` check would silently drop every symlink + // before the diagnostic branch below can fire — that was the bug in + // the initial port of upstream #1462. + if !ft.is_file() && !ft.is_symlink() { continue; } @@ -505,17 +532,17 @@ pub fn scan_project( } // Skip symlinks — prevents following links to /dev/urandom, recursive - // structures, etc. Log to stderr with the path relative to the scan - // root so a nested symlink in a deep subdirectory is unambiguous and - // the log renders with forward slashes on every platform. stdout stays - // clean for "Files: N" / "Drawers filed: N" markers callers parse. - // Mirrors upstream Python `scan_project` post-#1462. (#1462) - if path.is_symlink() { + // structures, etc. Log to `skip_log` with the path relative to the + // scan root so a nested symlink in a deep subdirectory is unambiguous + // and the log renders with forward slashes on every platform. stdout + // stays clean for "Files: N" / "Drawers filed: N" markers callers + // parse. Mirrors upstream Python `scan_project` post-#1462. (#1462) + if ft.is_symlink() { let rel = path .strip_prefix(&project_path) .map(|p| p.to_string_lossy().replace('\\', "/")) .unwrap_or_else(|_| path.to_string_lossy().to_string()); - let _ = writeln!(std::io::stderr(), " SKIP: {rel} (symlink)"); + let _ = writeln!(skip_log, " SKIP: {rel} (symlink)"); continue; } @@ -1098,7 +1125,9 @@ mod tests { // never follows links to /dev/urandom, recursive directories, or // resources outside the scan root. The "Files: 0" outcome surfaces // as a stderr SKIP log so callers can distinguish "no files" from - // "all the files were symlinks". + // "all the files were symlinks". Asserts the diagnostic actually + // fires — relying on result-set exclusion alone passes against + // dead-code symlink branches, which is how the initial port shipped. let temp = tempfile::TempDir::new().unwrap(); let root = temp.path(); let canonical_root = root.canonicalize().unwrap(); @@ -1106,7 +1135,8 @@ mod tests { std::fs::write(&real, "print('real')\n".repeat(20)).unwrap(); std::os::unix::fs::symlink(&real, root.join("link.py")).unwrap(); - let files = scan_project(root, false, None); + let mut log = Vec::new(); + let files = scan_project_with_log(root, false, None, &mut log); let rel: Vec = files .iter() .map(|p| { @@ -1117,6 +1147,11 @@ mod tests { }) .collect(); assert_eq!(rel, vec!["real.py"]); + let log = String::from_utf8(log).unwrap(); + assert!( + log.contains(" SKIP: link.py (symlink)\n"), + "expected SKIP diagnostic for link.py, got: {log:?}" + ); } #[cfg(unix)] @@ -1133,7 +1168,8 @@ mod tests { std::fs::write(&real, "print('real')\n".repeat(20)).unwrap(); std::os::unix::fs::symlink(&real, root.join("a/b/link.py")).unwrap(); - let files = scan_project(root, false, None); + let mut log = Vec::new(); + let files = scan_project_with_log(root, false, None, &mut log); let rel: Vec = files .iter() .map(|p| { @@ -1144,6 +1180,11 @@ mod tests { }) .collect(); assert_eq!(rel, vec!["a/b/real.py"]); + let log = String::from_utf8(log).unwrap(); + assert!( + log.contains(" SKIP: a/b/link.py (symlink)\n"), + "expected scan-root-relative SKIP path, got: {log:?}" + ); } #[cfg(unix)] @@ -1158,7 +1199,8 @@ mod tests { std::fs::write(root.join("real.py"), "print('real')\n".repeat(20)).unwrap(); std::os::unix::fs::symlink(root.join("missing.py"), root.join("dangling.py")).unwrap(); - let files = scan_project(root, false, None); + let mut log = Vec::new(); + let files = scan_project_with_log(root, false, None, &mut log); let rel: Vec = files .iter() .map(|p| { @@ -1169,6 +1211,30 @@ mod tests { }) .collect(); assert_eq!(rel, vec!["real.py"]); + let log = String::from_utf8(log).unwrap(); + assert!( + log.contains(" SKIP: dangling.py (symlink)\n"), + "expected SKIP diagnostic for dangling.py, got: {log:?}" + ); + } + + #[test] + fn test_scan_project_emits_no_skip_lines_when_no_symlinks_present() { + // Negative control: a regular directory with only real files must + // not emit any SKIP diagnostics on the writer. + let temp = tempfile::TempDir::new().unwrap(); + let root = temp.path(); + std::fs::write(root.join("a.py"), "print('a')\n".repeat(20)).unwrap(); + std::fs::write(root.join("b.py"), "print('b')\n".repeat(20)).unwrap(); + + let mut log = Vec::new(); + let files = scan_project_with_log(root, false, None, &mut log); + assert_eq!(files.len(), 2); + let log = String::from_utf8(log).unwrap(); + assert!( + !log.contains("SKIP"), + "no symlinks present, expected empty log, got: {log:?}" + ); } #[test]