Optimise string trimming on JavaScript

gleam-lang · Nov 20, 2024 · da7f50e · da7f50e
1 parent 2f9e187
commit da7f50e
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Unreleased
+
+- The performance of `string.trim`, `string.trim_start`, and `string.trim_end`
+  has been improved on JavaScript.
+
 ## v0.43.0 - 2024-11-17
 
 - `BytesBuilder` is now an alias of `BytesTree`.

diff --git a/src/gleam_stdlib.mjs b/src/gleam_stdlib.mjs
@@ -295,19 +295,80 @@ const unicode_whitespaces = [
   "\u2029", // Paragraph separator
 ].join("");
 
-const left_trim_regex = new RegExp(`^([${unicode_whitespaces}]*)`, "g");
-const right_trim_regex = new RegExp(`([${unicode_whitespaces}]*)$`, "g");
+const trim_end_regex = new RegExp(`[${unicode_whitespaces}]*$`);
+// Matches the whole string and captures everything between the leading and
+// trailing whitespace runs in group 1. The `s` (dotAll) flag is required so
+// that `.` also matches line terminators: without it a string with a line
+// break between non-whitespace characters (such as "a\nb") does not match,
+// `match` returns `null`, and reading group 1 from the result throws.
+const trim_regex = new RegExp(
+  `^[${unicode_whitespaces}]*(.*?)[${unicode_whitespaces}]*$`,
+  "s"
+);
+
+// Regex-based string trimming is fastest on Bun as of v1.1.35, but on
+// Node.js 22.11 it is fastest to implement the trimming loops manually.
+// Future versions of Bun or Node.js may change these performance
+// characteristics.
+const trim_using_regex = typeof Bun !== "undefined";
 
 export function trim(string) {
-  return trim_start(trim_end(string));
+  if (trim_using_regex) {
+    return string.match(trim_regex)[1];
+  } else {
+    const start_index = find_non_whitespace_char(string);
+
+    let end_index = rfind_non_whitespace_char(string) + 1;
+    if (end_index < start_index) {
+      end_index = start_index;
+    }
+
+    return string.substring(start_index, end_index);
+  }
 }
 
 export function trim_start(string) {
-  return string.replace(left_trim_regex, "");
+  return string.substring(find_non_whitespace_char(string));
 }
 
 export function trim_end(string) {
-  return string.replace(right_trim_regex, "");
+  if (trim_using_regex) {
+    return string.replace(trim_end_regex, "");
+  } else {
+    return string.substring(0, rfind_non_whitespace_char(string) + 1);
+  }
+}
+
+function isUnicodeWhitespace(c) {
+  return (
+    c === "\u0020" || // Space
+    c === "\u0009" || // Horizontal tab
+    c === "\u000A" || // Line feed
+    c === "\u000B" || // Vertical tab
+    c === "\u000C" || // Form feed
+    c === "\u000D" || // Carriage return
+    c === "\u0085" || // Next line
+    c === "\u2028" || // Line separator
+    c === "\u2029" // Paragraph separator
+  );
+}
+
+function find_non_whitespace_char(string) {
+  let i = 0;
+
+  for (; i < string.length; i++) {
+    if (!isUnicodeWhitespace(string[i])) {
+      break;
+    }
+  }
+
+  return i;
+}
+
+function rfind_non_whitespace_char(string) {
+  let i = string.length - 1;
+
+  for (; i >= 0; i--) {
+    if (!isUnicodeWhitespace(string[i])) {
+      break;
+    }
+  }
+
+  return i;
 }
 
 export function bit_array_from_string(string) {

diff --git a/test/gleam/string_test.gleam b/test/gleam/string_test.gleam
@@ -179,6 +179,23 @@ pub fn trim_end_test() {
   |> should.equal("  hats")
 }
 
+pub fn trim_whole_string_test() {
+  let s =
+    "\u{0020}\u{0009}\u{000A}\u{000B}\u{000C}\u{000D}\u{0085}\u{2028}\u{2029}"
+
+  s
+  |> string.trim_start
+  |> should.equal("")
+
+  s
+  |> string.trim_end
+  |> should.equal("")
+
+  s
+  |> string.trim
+  |> should.equal("")
+}
+
 // unicode whitespaces
 pub fn trim_horizontal_tab_test() {
   "hats\u{0009}"