diff --git a/CHANGELOG.md b/CHANGELOG.md
index 114fd9663..afceb35e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Unreleased
+
+- The performance of `string.trim`, `string.trim_start`, and `string.trim_end`
+  has been improved on JavaScript.
+
 ## v0.43.0 - 2024-11-17
 
 - `BytesBuilder` is now an alias of `BytesTree`.
diff --git a/src/gleam_stdlib.mjs b/src/gleam_stdlib.mjs
index a70309e59..7d8e4766f 100644
--- a/src/gleam_stdlib.mjs
+++ b/src/gleam_stdlib.mjs
@@ -295,19 +295,91 @@ const unicode_whitespaces = [
   "\u2029", // Paragraph separator
 ].join("");
 
-const left_trim_regex = new RegExp(`^([${unicode_whitespaces}]*)`, "g");
-const right_trim_regex = new RegExp(`([${unicode_whitespaces}]*)$`, "g");
+const trim_end_regex = new RegExp(`[${unicode_whitespaces}]*$`);
+
+// The `s` (dotAll) flag is required: without it `.` cannot match line
+// terminators, so a string with a line break between two non-whitespace
+// characters would not match and `trim` would fail on the `null` result.
+const trim_regex = new RegExp(
+  `^[${unicode_whitespaces}]*(.*?)[${unicode_whitespaces}]*$`,
+  "s"
+);
+
+// Regex-based string trimming is fastest on Bun as of v1.1.35, but on
+// Node.js 22.11 it is fastest to implement the trimming loops manually.
+// Future versions of Bun or Node.js may change these performance
+// characteristics.
+const trim_using_regex = typeof Bun !== "undefined";
 
 export function trim(string) {
-  return trim_start(trim_end(string));
+  if (trim_using_regex) {
+    return string.match(trim_regex)[1];
+  } else {
+    const start_index = find_non_whitespace_char(string);
+
+    // For an all-whitespace string `end_index` comes out as 0. Clamp it so
+    // `substring` does not swap its arguments and return the whole string.
+    let end_index = rfind_non_whitespace_char(string) + 1;
+    if (end_index < start_index) {
+      end_index = start_index;
+    }
+
+    return string.substring(start_index, end_index);
+  }
 }
 
 export function trim_start(string) {
-  return string.replace(left_trim_regex, "");
+  return string.substring(find_non_whitespace_char(string));
 }
 
 export function trim_end(string) {
-  return string.replace(right_trim_regex, "");
+  if (trim_using_regex) {
+    return string.replace(trim_end_regex, "");
+  } else {
+    return string.substring(0, rfind_non_whitespace_char(string) + 1);
+  }
+}
+
+// Whitespace test for the manual trimming loops. Keep in sync with
+// `unicode_whitespaces`, which the regex-based paths use.
+function is_unicode_whitespace(c) {
+  return (
+    c === "\u0020" || // Space
+    c === "\u0009" || // Horizontal tab
+    c === "\u000A" || // Line feed
+    c === "\u000B" || // Vertical tab
+    c === "\u000C" || // Form feed
+    c === "\u000D" || // Carriage return
+    c === "\u0085" || // Next line
+    c === "\u2028" || // Line separator
+    c === "\u2029" // Paragraph separator
+  );
+}
+
+// Index of the first non-whitespace character, or `string.length` if none.
+function find_non_whitespace_char(string) {
+  let i = 0;
+
+  for (; i < string.length; i++) {
+    if (!is_unicode_whitespace(string[i])) {
+      break;
+    }
+  }
+
+  return i;
+}
+
+// Index of the last non-whitespace character, or -1 if none.
+function rfind_non_whitespace_char(string) {
+  let i = string.length - 1;
+
+  for (; i >= 0; i--) {
+    if (!is_unicode_whitespace(string[i])) {
+      break;
+    }
+  }
+
+  return i;
 }
 
 export function bit_array_from_string(string) {
diff --git a/test/gleam/string_test.gleam b/test/gleam/string_test.gleam
index b98a4e145..641d64dd9 100644
--- a/test/gleam/string_test.gleam
+++ b/test/gleam/string_test.gleam
@@ -179,6 +179,29 @@ pub fn trim_end_test() {
   |> should.equal(" hats")
 }
 
+pub fn trim_whole_string_test() {
+  let s =
+    "\u{0020}\u{0009}\u{000A}\u{000B}\u{000C}\u{000D}\u{0085}\u{2028}\u{2029}"
+
+  s
+  |> string.trim_start
+  |> should.equal("")
+
+  s
+  |> string.trim_end
+  |> should.equal("")
+
+  s
+  |> string.trim
+  |> should.equal("")
+}
+
+pub fn trim_inner_line_breaks_test() {
+  "  hats\nscarves\rgloves\u{2028}boots\u{2029}socks  "
+  |> string.trim
+  |> should.equal("hats\nscarves\rgloves\u{2028}boots\u{2029}socks")
+}
+
 // unicode whitespaces
 pub fn trim_horizontal_tab_test() {
   "hats\u{0009}"