diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d674b29..ce167625 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 - The `uri` module gains the `empty` value, representing an empty URI which
   equivalent to `""`.
+- The `bit_array` module gains the `to_string_lossy` function.
 
 ## v0.54.0 - 2025-02-04
 
diff --git a/src/gleam/bit_array.gleam b/src/gleam/bit_array.gleam
index df75be59..f50deb26 100644
--- a/src/gleam/bit_array.gleam
+++ b/src/gleam/bit_array.gleam
@@ -97,6 +97,174 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
 @external(erlang, "gleam_stdlib", "identity")
 fn unsafe_to_string(a: BitArray) -> String
 
+/// Converts a bit array to a string. Data that is not valid UTF-8 is passed to
+/// the provided callback and its result is included in the final string in
+/// place of the invalid data.
+///
+/// Invalid data is passed to the callback one byte at a time. If the bit array
+/// is not a whole number of bytes long then the trailing bits are passed to
+/// the callback as a single, final chunk.
+///
+/// ## Examples
+///
+/// ```gleam
+/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
+/// // -> "A�1�"
+/// ```
+///
+pub fn to_string_lossy(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+) -> String {
+  to_string_lossy_impl(bits, map_invalid_bits, "")
+}
+
+// Decodes one codepoint at a time using the `utf8_codepoint` segment type,
+// which only matches well-formed UTF-8. A byte that does not begin a valid
+// codepoint is passed to `map_invalid_bits` on its own.
+@target(erlang)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    <<>> -> acc
+
+    <<x:utf8_codepoint, rest:bits>> ->
+      to_string_lossy_impl(
+        rest,
+        map_invalid_bits,
+        acc <> string.from_utf_codepoints([x]),
+      )
+
+    <<x:bytes-size(1), rest:bits>> ->
+      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))
+
+    // Fewer than eight bits remain
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// The following is the same as the above function but supports the JavaScript
+// target due to not using the `utf8_codepoint` bit array segment type. Once
+// the JavaScript target supports `utf8_codepoint` this function and its
+// helpers should be removed.
+@target(javascript)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    <<>> -> acc
+
+    <<b0, rest:bits>> ->
+      case to_string_lossy_decode(b0, rest) {
+        Ok(#(codepoint, rest)) ->
+          to_string_lossy_impl(
+            rest,
+            map_invalid_bits,
+            acc <> string.from_utf_codepoints([codepoint]),
+          )
+
+        // `b0` does not begin a well-formed sequence, so only that byte is
+        // replaced and decoding resumes at the byte after it, as on Erlang
+        Error(Nil) ->
+          to_string_lossy_impl(
+            rest,
+            map_invalid_bits,
+            acc <> map_invalid_bits(<<b0>>),
+          )
+      }
+
+    // Fewer than eight bits remain
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// Decodes the UTF-8 sequence that starts with the byte `b0` and continues in
+// `rest`, returning the codepoint along with the bits that follow the sequence.
+//
+// An error is returned for anything that is not well-formed UTF-8: stray or
+// missing continuation bytes, overlong encodings, and any value rejected by
+// `string.utf_codepoint`.
+@target(javascript)
+fn to_string_lossy_decode(
+  b0: Int,
+  rest: BitArray,
+) -> Result(#(UtfCodepoint, BitArray), Nil) {
+  case rest {
+    // 1-byte UTF-8 character
+    _ if b0 <= 0x7F -> to_string_lossy_codepoint(b0, 0, rest)
+
+    // 2-byte UTF-8 character
+    <<b1, rest:bits>>
+      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
+    ->
+      to_string_lossy_codepoint(
+        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F),
+        0x80,
+        rest,
+      )
+
+    // 3-byte UTF-8 character
+    <<b1, b2, rest:bits>>
+      if b0 >= 0xE0
+      && b0 <= 0xEF
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+    ->
+      to_string_lossy_codepoint(
+        int.bitwise_and(b0, 0x0F) * 4096
+          + int.bitwise_and(b1, 0x3F) * 64
+          + int.bitwise_and(b2, 0x3F),
+        0x0800,
+        rest,
+      )
+
+    // 4-byte UTF-8 character
+    <<b1, b2, b3, rest:bits>>
+      if b0 >= 0xF0
+      && b0 <= 0xF7
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+      && b3 >= 0x80
+      && b3 <= 0xBF
+    ->
+      to_string_lossy_codepoint(
+        int.bitwise_and(b0, 0x07) * 262_144
+          + int.bitwise_and(b1, 0x3F) * 4096
+          + int.bitwise_and(b2, 0x3F) * 64
+          + int.bitwise_and(b3, 0x3F),
+        0x10000,
+        rest,
+      )
+
+    _ -> Error(Nil)
+  }
+}
+
+// Pairs the codepoint for `value` with `rest`, so long as `value` is at least
+// `minimum` and is accepted by `string.utf_codepoint`. `minimum` is the
+// smallest value that needs the sequence length `value` was decoded from,
+// which is how overlong encodings are rejected.
+@target(javascript)
+fn to_string_lossy_codepoint(
+  value: Int,
+  minimum: Int,
+  rest: BitArray,
+) -> Result(#(UtfCodepoint, BitArray), Nil) {
+  case value >= minimum, string.utf_codepoint(value) {
+    True, Ok(codepoint) -> Ok(#(codepoint, rest))
+    _, _ -> Error(Nil)
+  }
+}
+
 /// Creates a new bit array by joining multiple binaries.
 ///
 /// ## Examples
diff --git a/test/gleam/bit_array_test.gleam b/test/gleam/bit_array_test.gleam
index 638a8b2d..ea2b3b03 100644
--- a/test/gleam/bit_array_test.gleam
+++ b/test/gleam/bit_array_test.gleam
@@ -233,6 +233,43 @@ pub fn to_string_erlang_only_test() {
   |> should.equal(Error(Nil))
 }
 
+pub fn to_string_lossy_test() {
+  <<>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("")
+
+  <<0x80, "A":utf8, 0x81>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("�A�")
+
+  // Test some codepoints that require 2/3/4 bytes to be stored as UTF-8
+  <<"£И한𐍈":utf8>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("£И한𐍈")
+
+  // An overlong encoding is invalid, so each of its bytes is replaced
+  <<0xC0, 0x80>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("��")
+
+  // An encoded surrogate is invalid, so each of its bytes is replaced
+  <<0xED, 0xA0, 0x80>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("���")
+
+  // A value above U+10FFFF is invalid, so each of its bytes is replaced
+  <<0xF4, 0x90, 0x80, 0x80>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("����")
+}
+
+@target(erlang)
+pub fn to_string_lossy_erlang_only_test() {
+  <<"ø":utf8, 50:4>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("ø�")
+}
+
 pub fn is_utf8_test() {
   <<>>
   |> bit_array.is_utf8