Skip to content
Open
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/codegraph-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ tree-sitter-dart = "0.0.4"
tree-sitter-zig = "1"
tree-sitter-haskell = "0.23"
tree-sitter-ocaml = "0.24"
tree-sitter-fsharp = "0.3"
tree-sitter-gleam = "1"
tree-sitter-julia = "0.23"
tree-sitter-clojure-orchard = "0.2"
Expand Down
8 changes: 4 additions & 4 deletions crates/codegraph-core/src/change_detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ fn load_file_hashes(conn: &Connection) -> Option<HashMap<String, FileHashRow>> {
/// found on disk are treated as removed.
///
/// Files whose extension is outside the Rust file_collector's supported set
/// (e.g. `.fs`, `.fsx` — WASM-only languages) are skipped:
/// (e.g. `.m`, `.v` — WASM-only languages) are skipped:
/// the orchestrator's narrower collector never sees them, so absence from
/// `current` is a capability boundary, not a deletion. Their `nodes` and
/// `file_hashes` rows are owned by the JS-side WASM backfill (#967, #1068)
Expand Down Expand Up @@ -774,15 +774,15 @@ mod tests {

#[test]
fn detect_removed_skips_unsupported_extensions() {
// Files in WASM-only languages (F#, F# Script) live in
// Files in WASM-only languages (Objective-C, Verilog) live in
// `file_hashes` because the JS-side WASM backfill writes them, but
// Rust's narrower file_collector never collects them. Without this
// skip, every incremental rebuild would flag them as removed and
// purge their rows — the #1066 ~2s floor.
let mut existing = HashMap::new();
for path in [
"tests/fixtures/fsharp/Main.fs",
"tests/fixtures/fsharp/Main.fsx",
"tests/fixtures/objc/main.m",
"tests/fixtures/verilog/main.v",
] {
existing.insert(
path.to_string(),
Expand Down
302 changes: 302 additions & 0 deletions crates/codegraph-core/src/extractors/fsharp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
use tree_sitter::{Node, Tree};
use crate::cfg::build_function_cfg;
use crate::complexity::compute_all_metrics;
use crate::types::*;
use super::helpers::*;
use super::SymbolExtractor;

/// Symbol extractor for F# sources parsed with tree-sitter.
pub struct FSharpExtractor;

impl SymbolExtractor for FSharpExtractor {
    /// Builds the `FileSymbols` for one parsed file.
    ///
    /// Two passes run over the same root node: the first dispatches every
    /// node to `match_fsharp_node`, the second fills `ast_nodes` using
    /// `FSHARP_AST_CONFIG`.
    fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
        let root = tree.root_node();
        let mut collected = FileSymbols::new(file_path.to_string());

        walk_tree(&root, source, &mut collected, match_fsharp_node);
        walk_ast_nodes_with_config(&root, source, &mut collected.ast_nodes, &FSHARP_AST_CONFIG);

        collected
    }
}

fn match_fsharp_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) {
match node.kind() {
"named_module" => handle_named_module(node, source, symbols),
"function_declaration_left" => handle_function_decl(node, source, symbols),
"type_definition" => handle_type_def(node, source, symbols),
"import_decl" => handle_import_decl(node, source, symbols),
"application_expression" => handle_application(node, source, symbols),
"dot_expression" => handle_dot_expression(node, source, symbols),
_ => {}
}
}

/// Returns the text of the `long_identifier` child of the `named_module`
/// that encloses `node`, or `None` when there is no such module or the
/// module has no `long_identifier` child.
fn enclosing_module_name(node: &Node, source: &[u8]) -> Option<String> {
    find_parent_of_type(node, "named_module")
        .and_then(|module| find_child(&module, "long_identifier"))
        .map(|ident| node_text(&ident, source).to_string())
}

/// Records a `module` definition for a `named_module` node.
///
/// The name is the text of the node's `long_identifier` child; nothing is
/// recorded when that child is missing.
fn handle_named_module(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
    if let Some(ident) = find_child(node, "long_identifier") {
        let module_def = Definition {
            name: node_text(&ident, source).to_string(),
            kind: "module".to_string(),
            line: start_line(node),
            end_line: Some(end_line(node)),
            decorators: None,
            complexity: None,
            cfg: None,
            children: None,
        };
        symbols.definitions.push(module_def);
    }
}

/// Records a `function` definition for a `function_declaration_left` node.
///
/// The stored name is prefixed with the enclosing `named_module` name
/// (`Module.name`) when there is one. Parameters found under
/// `argument_patterns` become the definition's children. End line,
/// complexity metrics and CFG are computed on the node's parent (falling
/// back to the node itself when it has no parent).
fn handle_function_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
    // function_declaration_left: first child is the function name identifier,
    // followed by argument_patterns.
    let name_node = match find_child(node, "identifier") {
        Some(n) => n,
        None => return,
    };
    let raw_name = node_text(&name_node, source).to_string();
    let line = start_line(node);

    // Qualify the name *before* the duplicate check: definitions are stored
    // under the qualified name, so the guard has to compare against that.
    let qualified = match enclosing_module_name(node, source) {
        Some(m) => format!("{}.{}", m, raw_name),
        None => raw_name,
    };

    // Avoid duplicates — the DFS walk may also visit an inner
    // `function_declaration_left` of a multi-parameter function
    // (e.g. `let add x y = …`), which would otherwise push the same
    // `(name, line)` definition twice. Comparing against the raw identifier
    // would never match inside a named module, because the stored name
    // carries the module prefix.
    if symbols
        .definitions
        .iter()
        .any(|d| d.name == qualified && d.line == line)
    {
        return;
    }

    let params = extract_fsharp_params(node, source);

    // Use the parent node (expected to be the function_or_value_defn) for the
    // end line, metrics and CFG, matching the JS extractor.
    let end = node.parent().unwrap_or(*node);

    symbols.definitions.push(Definition {
        name: qualified,
        kind: "function".to_string(),
        line,
        end_line: Some(end_line(&end)),
        decorators: None,
        complexity: compute_all_metrics(&end, source, "fsharp"),
        cfg: build_function_cfg(&end, "fsharp", source),
        children: opt_children(params),
    });
Comment on lines +55 to +99
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Missing deduplication guard diverges from JS extractor

The JS handleFunctionDecl has an explicit guard (lines 92–94 in src/extractors/fsharp.ts) that skips adding a definition when one with the same name and line already exists. The comment reads: "Avoid duplicates — the walk will also visit children." This guard exists because tree-sitter-fsharp nests function_declaration_left nodes for curried multi-parameter functions (e.g. let add x y = x + y), causing the walk to encounter two nodes at the same start line that both expose an identifier "add" as a direct child. The Rust walk_tree visits every node exactly once in DFS order, so both the outer and inner function_declaration_left are processed, and handle_function_decl pushes two identical Definition entries where the JS engine pushes one. Any downstream consumer that de-dupes by name+line will be unaffected, but callers that use definitions.len() or iterate all entries will see doubled results for every multi-parameter function in an F# file.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 2dc3fbf. The Rust extractor now mirrors the JS dedup guard: before pushing a function definition, it checks whether one with the same raw identifier name and start line already exists, and skips if so. The check uses the unqualified raw_name to match the JS behavior exactly (where the dedup compares against nameNode.text). This eliminates the doubled Definition entries for curried multi-parameter functions like let add x y = ….

See crates/codegraph-core/src/extractors/fsharp.rs:55-78.

}

/// Collects one `parameter` child definition per identifier found under the
/// `argument_patterns` child of `decl_left`. Returns an empty vector when
/// that child is missing.
fn extract_fsharp_params(decl_left: &Node, source: &[u8]) -> Vec<Definition> {
    let mut collected = Vec::new();
    match find_child(decl_left, "argument_patterns") {
        Some(patterns) => collect_param_identifiers(&patterns, source, &mut collected),
        None => {}
    }
    collected
}

/// Walks `node` depth-first and appends a `parameter` child definition for
/// every `identifier` node encountered. An identifier is a leaf for this
/// walk: its own children are not visited.
fn collect_param_identifiers(node: &Node, source: &[u8], params: &mut Vec<Definition>) {
    if node.kind() != "identifier" {
        // Not an identifier: keep descending, in child order.
        (0..node.child_count())
            .filter_map(|idx| node.child(idx))
            .for_each(|child| collect_param_identifiers(&child, source, params));
        return;
    }

    let param = child_def(
        node_text(node, source).to_string(),
        "parameter",
        start_line(node),
    );
    params.push(param);
}

/// Records one definition per recognised type-definition child of a
/// `type_definition` node (union, record, abbreviation, class, interface or
/// generic `type_defn`).
///
/// The name comes from the `identifier` inside the child's `type_name`,
/// falling back to the whole `type_name` text, or to a direct `identifier`
/// child when there is no `type_name`. Children with no resolvable name are
/// skipped. Union cases and record fields are attached as child definitions.
fn handle_type_def(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
    let candidates = (0..node.child_count()).filter_map(|idx| node.child(idx));

    for defn in candidates {
        let defn_kind = defn.kind();
        let is_type_defn = matches!(
            defn_kind,
            "union_type_defn"
                | "record_type_defn"
                | "type_abbreviation_defn"
                | "class_type_defn"
                | "interface_type_defn"
                | "type_defn"
        );
        if !is_type_defn {
            continue;
        }

        // Resolve the type's name; `None` means nothing usable was found.
        let resolved = match find_child(&defn, "type_name") {
            Some(type_name) => Some(match find_child(&type_name, "identifier") {
                Some(id) => node_text(&id, source).to_string(),
                None => node_text(&type_name, source).to_string(),
            }),
            None => find_child(&defn, "identifier").map(|id| node_text(&id, source).to_string()),
        };
        let name = match resolved {
            Some(text) => text,
            None => continue,
        };

        let mut members: Vec<Definition> = Vec::new();
        extract_type_members(&defn, source, &mut members);

        let type_def = Definition {
            name,
            kind: determine_type_kind(defn_kind).to_string(),
            line: start_line(&defn),
            end_line: Some(end_line(&defn)),
            decorators: None,
            complexity: None,
            cfg: None,
            children: opt_children(members),
        };
        symbols.definitions.push(type_def);
    }
}

/// Maps a tree-sitter type-definition node kind to the definition kind
/// stored for it. Any kind not listed in the table maps to `"type"`.
fn determine_type_kind(node_kind: &str) -> &'static str {
    const KIND_TABLE: [(&str, &str); 4] = [
        ("union_type_defn", "enum"),
        ("record_type_defn", "record"),
        ("class_type_defn", "class"),
        ("interface_type_defn", "interface"),
    ];

    KIND_TABLE
        .iter()
        .find(|(grammar_kind, _)| *grammar_kind == node_kind)
        .map(|(_, symbol_kind)| *symbol_kind)
        .unwrap_or("type")
}

/// Appends a `property` child definition for each union case and record
/// field directly under `type_defn`, descending through the
/// `union_type_cases` / `record_fields` container nodes. All other child
/// kinds are ignored.
fn extract_type_members(type_defn: &Node, source: &[u8], children: &mut Vec<Definition>) {
    for member in (0..type_defn.child_count()).filter_map(|idx| type_defn.child(idx)) {
        let member_kind = member.kind();

        // Container nodes hold the actual cases/fields one level down.
        if member_kind == "union_type_cases" || member_kind == "record_fields" {
            extract_type_members(&member, source, children);
            continue;
        }

        let label = if member_kind == "union_type_case" {
            find_child(&member, "identifier")
        } else if member_kind == "record_field" {
            // Prefer the grammar's `name` field, then any identifier child.
            member
                .child_by_field_name("name")
                .or_else(|| find_child(&member, "identifier"))
        } else {
            None
        };

        if let Some(label) = label {
            children.push(child_def(
                node_text(&label, source).to_string(),
                "property",
                start_line(&member),
            ));
        }
    }
}

/// Records an import for an `import_decl` node.
///
/// The import source is the full text of the `long_identifier` child; the
/// single imported name is its last dot-separated segment. Nothing is
/// recorded when the node has no `long_identifier` child.
fn handle_import_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
    if let Some(path_node) = find_child(node, "long_identifier") {
        let full_path = node_text(&path_node, source).to_string();
        // Last segment after the final '.', or the whole path if undotted.
        let leaf = full_path
            .rsplit('.')
            .next()
            .unwrap_or(&full_path)
            .to_string();

        let import = Import::new(full_path, vec![leaf], start_line(node));
        symbols.imports.push(import);
    }
}

/// Records a call for an `application_expression` node.
///
/// The callee is taken from the node's first child. `identifier` and
/// `long_identifier` callees are recorded with their full text as the call
/// name; a `long_identifier_or_op` callee is unwrapped to its inner
/// `identifier` or `long_identifier`. The call is always recorded with
/// `receiver: None` and `dynamic: None`, at the application's start line.
fn handle_application(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
let func_node = match node.child(0) {
Some(n) => n,
None => return,
};

// Mirrors the JS extractor's `handleApplication`: the full dotted name
// (e.g. `Service.createUser`) is stored in `name`. Splitting `name` into
// `(receiver, method)` would diverge from the JS engine's output and
// change which resolution rules fire downstream.
match func_node.kind() {
"identifier" | "long_identifier" => {
symbols.calls.push(Call {
name: node_text(&func_node, source).to_string(),
line: start_line(node),
dynamic: None,
receiver: None,
});
}
"long_identifier_or_op" => {
// Inner child is either `identifier` (bare, e.g. `validateUser`) or
// `long_identifier` (qualified, e.g. `Repository.save`). Order
// matches the JS extractor (`identifier` first). Operator forms
// like `( + )` have neither child; we emit nothing in that case,
// mirroring the JS extractor's silent skip.
if let Some(inner) = find_child(&func_node, "identifier")
.or_else(|| find_child(&func_node, "long_identifier"))
{
symbols.calls.push(Call {
name: node_text(&inner, source).to_string(),
line: start_line(node),
dynamic: None,
receiver: None,
});
}
}
Comment on lines +257 to +273
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Divergence from JS extractor in handle_application

Two behavioural differences exist versus the JS handleApplication that the PR claims to mirror:

  1. Search order flipped: The JS extractor tries identifier first, then long_identifier inside a long_identifier_or_op wrapper (findChild(funcNode, 'identifier') || findChild(funcNode, 'long_identifier')). The Rust version tries long_identifier first. For a node containing both kinds, the preferred result will differ.

  2. Extra fallback emits operator calls: When neither child is found (e.g., an operator expression like ( + )), JS emits nothing. Rust falls back to the raw text of func_node and still pushes a Call. This means every operator application in an F# file produces a spurious call entry in the native engine that the WASM engine never produces, diverging the two outputs.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in the merge resolution commit. The handle_application branch for long_identifier_or_op now matches the JS extractor exactly:

  1. Search order is now identifier first, then long_identifier (matches findChild(funcNode, 'identifier') || findChild(funcNode, 'long_identifier') in the JS extractor).
  2. When neither child is present (operator forms like ( + )), the Rust extractor emits nothing — mirroring the JS extractor's silent skip. The previous fallback that pushed a Call with the raw func_node text has been removed.

See crates/codegraph-core/src/extractors/fsharp.rs:242-260.

_ => {}
}
}

fn handle_dot_expression(node: &Node, source: &[u8], symbols: &mut FileSymbols) {
// Mirrors the JS extractor's `handleDotExpression`: collect identifier
// segments and emit `name = last`, `receiver = everything-before`.
let mut parts: Vec<String> = Vec::new();
for i in 0..node.child_count() {
if let Some(child) = node.child(i) {
match child.kind() {
"identifier" | "long_identifier" => {
parts.push(node_text(&child, source).to_string());
}
_ => {}
}
}
}
if parts.len() >= 2 {
let method = parts.last().cloned().unwrap_or_default();
let receiver = parts[..parts.len() - 1].join(".");
symbols.calls.push(Call {
name: method,
line: start_line(node),
dynamic: None,
receiver: Some(receiver),
});
}
}
12 changes: 12 additions & 0 deletions crates/codegraph-core/src/extractors/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,18 @@ pub const OCAML_AST_CONFIG: LangAstConfig = LangAstConfig {
string_prefixes: &[],
};

// AST-node collection config for F#, passed to `walk_ast_nodes_with_config`
// by the F# extractor.
//
// F# string nodes in tree-sitter-fsharp surface under the `string` kind inside
// `const` literals. The grammar exposes no dedicated raw-string or regex form.
// Only string literals are configured; the new/throw/await/regex lists are
// deliberately left empty.
pub const FSHARP_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &[],
await_types: &[],
string_types: &["string"],
regex_types: &[],
quote_chars: &['"'],
string_prefixes: &[],
};

pub const GLEAM_AST_CONFIG: LangAstConfig = LangAstConfig {
new_types: &[],
throw_types: &[],
Expand Down
Loading
Loading