Skip to content

Commit 2082fb3

Browse files
authored
fix(native): skip unsupported-extension files in detect_removed_files (#1070)
The Rust file_collector only collects files whose extension is recognized by `LanguageKind::from_extension` or listed in `SUPPORTED_EXTENSIONS`. The JS LANGUAGE_REGISTRY is broader — Clojure, Gleam, Julia, F# files exist in `file_hashes` because the JS-side WASM backfill writes them (#1068), but Rust's narrower collector never sees them.

Before this fix, `detect_removed_files` flagged every such file as "removed" on every incremental rebuild because they were absent from `current` (the just-collected file list). The orchestrator's purge step then deleted their `nodes` and `file_hashes` rows, and the JS-side `backfillNativeDroppedFiles` (now running on every pass per #1069) re-parsed them with WASM and re-inserted the rows — the ~2s 1-file rebuild floor reported in #1066.

Add `is_supported_extension` to `file_collector` (exposing the same predicate used by `collect_files`) and apply it as a pre-filter in `detect_removed_files`. Files outside Rust's capability are now left alone: their absence from `current` is a capability boundary, not a deletion. Rows owned by the JS layer persist across incremental rebuilds and the backfill's missing-file early-return at `pipeline.ts:811` finally fires.

Refs #1066

Impact: 3 functions changed, 4 affected
1 parent 8fcdb86 commit 2082fb3

2 files changed

Lines changed: 72 additions & 0 deletions

File tree

crates/codegraph-core/src/change_detection.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
//! when switching between JS and native engines, so hash format compatibility is
1111
//! not required.
1212
13+
use crate::file_collector::is_supported_extension;
1314
use crate::journal;
1415
use rusqlite::Connection;
1516
use sha2::{Digest, Sha256};
@@ -129,6 +130,14 @@ fn load_file_hashes(conn: &Connection) -> Option<HashMap<String, FileHashRow>> {
129130
/// When `scoped_rel_paths` is provided (scoped rebuild), only files within that
130131
/// scope are considered candidates for removal. Without it, all DB files not
131132
/// found on disk are treated as removed.
133+
///
134+
/// Files whose extension is outside the Rust file_collector's supported set
135+
/// (e.g. `.clj`, `.gleam`, `.jl`, `.fs` — WASM-only languages) are skipped:
136+
/// the orchestrator's narrower collector never sees them, so absence from
137+
/// `current` is a capability boundary, not a deletion. Their `nodes` and
138+
/// `file_hashes` rows are owned by the JS-side WASM backfill (#967, #1068)
139+
/// and must be left alone, otherwise every incremental rebuild purges and
140+
/// re-creates them — the ~2s floor reported in #1066.
132141
fn detect_removed_files(
133142
existing: &HashMap<String, FileHashRow>,
134143
all_files: &[String],
@@ -143,6 +152,9 @@ fn detect_removed_files(
143152
existing
144153
.keys()
145154
.filter(|f| {
155+
if !is_supported_extension(f) {
156+
return false;
157+
}
146158
// When scope is set, only consider files within scope as candidates.
147159
if let Some(scope) = scoped_rel_paths {
148160
scope.contains(*f) && !current.contains(*f)
@@ -759,4 +771,45 @@ mod tests {
759771
let removed = detect_removed_files(&existing, &all_files, "/project", None);
760772
assert_eq!(removed, vec!["src/b.ts"]);
761773
}
774+
775+
#[test]
fn detect_removed_skips_unsupported_extensions() {
    // WASM-only languages (Clojure, Gleam, Julia, F#) have `file_hashes`
    // rows written by the JS-side backfill, yet the Rust file_collector
    // never collects them. They must not be reported as removed, otherwise
    // each incremental rebuild purges and re-creates their rows (#1066).
    let wasm_only_paths = [
        "tests/fixtures/clojure/main.clj",
        "tests/fixtures/gleam/main.gleam",
        "tests/fixtures/julia/main.jl",
        "tests/fixtures/fsharp/Main.fs",
    ];

    // One supported file is tracked as well; it is absent from the collected
    // list below, so it is the only path expected in the result.
    let existing: HashMap<String, FileHashRow> = wasm_only_paths
        .iter()
        .copied()
        .chain(std::iter::once("src/deleted.ts"))
        .map(|path| {
            let row = FileHashRow {
                file: path.to_string(),
                hash: "h".to_string(),
                mtime: 0,
                size: 0,
            };
            (path.to_string(), row)
        })
        .collect();

    // Nothing was collected from disk on this pass.
    let all_files: Vec<String> = Vec::new();

    let removed = detect_removed_files(&existing, &all_files, "/project", None);
    assert_eq!(removed, vec!["src/deleted.ts"]);
}
762815
}

crates/codegraph-core/src/file_collector.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,25 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[
3838
"kts", "swift", "scala", "sh", "bash", "ex", "exs", "lua", "dart", "zig", "hs", "ml", "mli",
3939
];
4040

41+
/// Returns whether `path` has an extension the Rust file_collector would accept.
42+
///
43+
/// Mirrors the predicate at the heart of `collect_files`: a file is collected
44+
/// if `LanguageKind::from_extension` recognizes it OR its raw extension is in
45+
/// `SUPPORTED_EXTENSIONS`. Exposed for `change_detection::detect_removed_files`
46+
/// so that files outside Rust's capability (e.g. WASM-only `.clj`, `.gleam`,
47+
/// `.jl`) are not flagged as "removed" merely because the orchestrator's
48+
/// narrower collector never sees them.
49+
pub fn is_supported_extension(path: &str) -> bool {
50+
if LanguageKind::from_extension(path).is_some() {
51+
return true;
52+
}
53+
let ext = Path::new(path)
54+
.extension()
55+
.and_then(|e| e.to_str())
56+
.unwrap_or("");
57+
SUPPORTED_EXTENSIONS.contains(&ext)
58+
}
59+
4160
/// Result of file collection.
4261
pub struct CollectResult {
4362
/// Absolute paths of all collected source files.

0 commit comments

Comments
 (0)