Skip to content

Commit

Permalink
Optimise string trimming on JavaScript
Browse files Browse the repository at this point in the history
  • Loading branch information
richard-viney committed Nov 20, 2024
1 parent 2f9e187 commit da7f50e
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## Unreleased

- The performance of `string.trim`, `string.trim_start`, and `string.trim_end`
has been improved on JavaScript.

## v0.43.0 - 2024-11-17

- `BytesBuilder` is now an alias of `BytesTree`.
Expand Down
71 changes: 66 additions & 5 deletions src/gleam_stdlib.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -295,19 +295,80 @@ const unicode_whitespaces = [
"\u2029", // Paragraph separator
].join("");

// Whitespace runs anchored at the start and at the end of a string. The
// capture groups are not read by `String.prototype.replace(regex, "")`.
const left_trim_regex = new RegExp(`^([${unicode_whitespaces}]*)`, "g");
const right_trim_regex = new RegExp(`([${unicode_whitespaces}]*)$`, "g");

// Trailing whitespace run only; no flags, so a single replacement is made.
const trim_end_regex = new RegExp(`[${unicode_whitespaces}]*$`);

// Captures, in group 1, everything between the leading and trailing
// whitespace runs. The middle is written as `[\s\S]*?` rather than `.*?`
// because `.` does not match line terminators (\n, \r, \u2028, \u2029):
// with `.` the pattern fails on any string that has a line break between
// non-whitespace characters and `String.prototype.match` returns null.
const trim_regex = new RegExp(
  `^[${unicode_whitespaces}]*([\\s\\S]*?)[${unicode_whitespaces}]*$`,
);

// Regex-based string trimming is fastest on Bun as of v1.1.35, but on
// Node.js 22.11 it is fastest to implement the trimming loops manually.
// Future versions of Bun or Node.js may change these performance
// characteristics.
const trim_using_regex = typeof Bun !== "undefined";

/**
 * Removes leading and trailing whitespace from `string`.
 *
 * When `trim_using_regex` is set, the result is taken from capture group 1 of
 * `trim_regex`; otherwise (or if that pattern does not match) the bounds are
 * located with `find_non_whitespace_char` / `rfind_non_whitespace_char`.
 *
 * @param {string} string - The text to trim.
 * @returns {string} `string` without its leading and trailing whitespace.
 */
export function trim(string) {
  if (trim_using_regex) {
    // `match` returns null when the pattern does not cover the whole input;
    // indexing that result would throw, so fall through to the manual scan.
    const match = string.match(trim_regex);
    if (match !== null) {
      return match[1];
    }
  }

  const start_index = find_non_whitespace_char(string);

  // For an all-whitespace string the backward scan yields -1, which puts the
  // end before the start. `substring` swaps its arguments in that case and
  // would return the untrimmed text, so clamp the end to the start instead.
  const end_index = Math.max(
    start_index,
    rfind_non_whitespace_char(string) + 1,
  );

  return string.substring(start_index, end_index);
}

/**
 * Removes leading whitespace from `string`.
 *
 * @param {string} string - The text to trim.
 * @returns {string} The part of `string` starting at the index reported by
 *   `find_non_whitespace_char`.
 */
export function trim_start(string) {
  return string.substring(find_non_whitespace_char(string));
}

/**
 * Removes trailing whitespace from `string`.
 *
 * Uses `trim_end_regex` when `trim_using_regex` is set, and otherwise cuts the
 * string just after the index reported by `rfind_non_whitespace_char`.
 *
 * @param {string} string - The text to trim.
 * @returns {string} `string` without its trailing whitespace.
 */
export function trim_end(string) {
  if (trim_using_regex) {
    return string.replace(trim_end_regex, "");
  }

  // The backward scan yields -1 when no non-whitespace character exists,
  // which makes this `substring(0, 0)`, i.e. the empty string.
  return string.substring(0, rfind_non_whitespace_char(string) + 1);
}

/**
 * Tells whether `c` is exactly one of the nine whitespace characters listed
 * below. Comparison is strict, so any other value yields `false`.
 *
 * @param {string} c - The single character to classify.
 * @returns {boolean} `true` if `c` is one of the listed characters.
 */
function isUnicodeWhitespace(c) {
  switch (c) {
    case "\u0020": // Space
    case "\u0009": // Horizontal tab
    case "\u000A": // Line feed
    case "\u000B": // Vertical tab
    case "\u000C": // Form feed
    case "\u000D": // Carriage return
    case "\u0085": // Next line
    case "\u2028": // Line separator
    case "\u2029": // Paragraph separator
      return true;
    default:
      return false;
  }
}

/**
 * Scans `string` from the front for the first character that
 * `isUnicodeWhitespace` rejects.
 *
 * @param {string} string - The text to scan.
 * @returns {number} Index of that character, or `string.length` when every
 *   character is whitespace (including the empty string).
 */
function find_non_whitespace_char(string) {
  const length = string.length;
  let index = 0;

  while (index < length && isUnicodeWhitespace(string[index])) {
    index += 1;
  }

  return index;
}

/**
 * Scans `string` from the back for the last character that
 * `isUnicodeWhitespace` rejects.
 *
 * @param {string} string - The text to scan.
 * @returns {number} Index of that character, or `-1` when every character is
 *   whitespace (including the empty string).
 */
function rfind_non_whitespace_char(string) {
  // `end` is one past the candidate index, so it shrinks while the character
  // just before it is whitespace.
  let end = string.length;

  while (end > 0 && isUnicodeWhitespace(string[end - 1])) {
    end -= 1;
  }

  return end - 1;
}

export function bit_array_from_string(string) {
Expand Down
17 changes: 17 additions & 0 deletions test/gleam/string_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,23 @@ pub fn trim_end_test() {
|> should.equal(" hats")
}

// A string made up solely of whitespace code points must be reduced to the
// empty string by each of the three trimming functions.
pub fn trim_whole_string_test() {
  let only_whitespace =
    "\u{0020}\u{0009}\u{000A}\u{000B}\u{000C}\u{000D}\u{0085}\u{2028}\u{2029}"

  should.equal(string.trim_start(only_whitespace), "")

  should.equal(string.trim_end(only_whitespace), "")

  should.equal(string.trim(only_whitespace), "")
}

// unicode whitespaces
pub fn trim_horizontal_tab_test() {
"hats\u{0009}"
Expand Down

0 comments on commit da7f50e

Please sign in to comment.