From d092588b525ec9f646bb80e1ad535abcff8bdc75 Mon Sep 17 00:00:00 2001 From: Louis Pilfold Date: Tue, 4 Feb 2025 11:30:35 +0000 Subject: [PATCH 1/2] v0.54.0 --- CHANGELOG.md | 2 +- gleam.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c1b6945..0a7eb6d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## Unreleased +## v0.54.0 - 2025-02-04 - The deprecated `drop_left`, `drop_right`, `pad_left`, `pad_right`, `trim_left`, and `trim_right` functions have been removed. diff --git a/gleam.toml b/gleam.toml index feccba3a..53d5d0d1 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,5 +1,5 @@ name = "gleam_stdlib" -version = "0.53.0" +version = "0.54.0" gleam = ">= 0.32.0" licences = ["Apache-2.0"] description = "A standard library for the Gleam programming language" From 78c816e90f0edac96d91468544677438f772915e Mon Sep 17 00:00:00 2001 From: Richard Viney Date: Thu, 6 Feb 2025 13:15:57 +1300 Subject: [PATCH 2/2] Add bit_array.to_string_lossy --- CHANGELOG.md | 4 + src/gleam/bit_array.gleam | 148 ++++++++++++++++++++++++++++++++ test/gleam/bit_array_test.gleam | 22 +++++ 3 files changed, 174 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a7eb6d9..95a76fd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Unreleased + +- The `bit_array` module gains the `to_string_lossy` function. + ## v0.54.0 - 2025-02-04 - The deprecated `drop_left`, `drop_right`, `pad_left`, `pad_right`, diff --git a/src/gleam/bit_array.gleam b/src/gleam/bit_array.gleam index df75be59..f50deb26 100644 --- a/src/gleam/bit_array.gleam +++ b/src/gleam/bit_array.gleam @@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) { @external(erlang, "gleam_stdlib", "identity") fn unsafe_to_string(a: BitArray) -> String +/// Converts a bit array to a string. 
+/// Invalid bits are passed to the provided
+/// callback and its result is included in the final string in place of the
+/// invalid data. Valid UTF-8 in the input is kept as it is.
+///
+/// The callback is called once for each piece of invalid data, in the order
+/// the pieces appear, so it may run several times for a single bit array.
+///
+/// ## Examples
+///
+/// ```gleam
+/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
+/// // -> "A�1�"
+/// ```
+///
+pub fn to_string_lossy(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+) -> String {
+  // Start the conversion with an empty accumulator.
+  to_string_lossy_impl(bits, map_invalid_bits, "")
+}
+
+// Erlang implementation of `to_string_lossy`. `acc` is the text built so far
+// and is returned once `bits` has been fully consumed.
+@target(erlang)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    // Nothing left to convert.
+    <<>> -> acc
+
+    // The input starts with a valid UTF-8 codepoint: keep it.
+    <<x:utf8_codepoint, rest:bits>> ->
+      to_string_lossy_impl(
+        rest,
+        map_invalid_bits,
+        acc <> string.from_utf_codepoints([x]),
+      )
+
+    // The leading byte does not start a valid codepoint: hand that single
+    // byte to the callback and carry on with the byte after it.
+    <<x:bytes-size(1), rest:bits>> ->
+      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))
+
+    // Less than a whole byte remains: pass it to the callback as the final
+    // piece of invalid data.
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// The following is the same as the above function but supports the JavaScript
+// target due to not using the `utf8_codepoint` bit array segment type. Once
+// the JavaScript target supports `utf8_codepoint` this function should be
+// removed.
+//
+// A byte sequence is only decoded when it is well-formed UTF-8: the encoded
+// value must need the number of bytes that were used (no overlong forms) and
+// must be accepted by `string.utf_codepoint`. Everything else is passed to
+// `map_invalid_bits` one byte at a time, with any trailing partial byte
+// passed last. `acc` is the text built so far.
+@target(javascript)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    // Nothing left to convert.
+    <<>> -> acc
+
+    <<b0, after_lead:bits>> ->
+      case lossy_decode_sequence(b0, after_lead) {
+        // A well-formed character starts at `b0`: keep it and move past it.
+        Ok(#(character, rest)) ->
+          to_string_lossy_impl(rest, map_invalid_bits, acc <> character)
+
+        // `b0` does not start a well-formed character. Only that one byte is
+        // handed to the callback and conversion resumes at the very next
+        // byte, so a byte that itself starts a valid character is never
+        // discarded along with it.
+        Error(Nil) ->
+          to_string_lossy_impl(
+            after_lead,
+            map_invalid_bits,
+            acc <> map_invalid_bits(<<b0>>),
+          )
+      }
+
+    // The input is not empty but does not start with a whole byte.
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// Tries to decode the single UTF-8 encoded character whose first byte is `b0`
+// and whose continuation bytes, if any, are at the front of `after_lead`. On
+// success returns the character as a string together with the input that
+// follows it.
+@target(javascript)
+fn lossy_decode_sequence(
+  b0: Int,
+  after_lead: BitArray,
+) -> Result(#(String, BitArray), Nil) {
+  case after_lead {
+    // 1-byte UTF-8 character
+    _ if b0 <= 0x7F -> lossy_checked_codepoint(b0, 0, after_lead)
+
+    // 2-byte UTF-8 character
+    <<b1, rest:bits>>
+      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
+    ->
+      lossy_checked_codepoint(
+        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F),
+        0x80,
+        rest,
+      )
+
+    // 3-byte UTF-8 character
+    <<b1, b2, rest:bits>>
+      if b0 >= 0xE0
+      && b0 <= 0xEF
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+    ->
+      lossy_checked_codepoint(
+        int.bitwise_and(b0, 0x0F)
+          * 4096
+          + int.bitwise_and(b1, 0x3F)
+          * 64
+          + int.bitwise_and(b2, 0x3F),
+        0x800,
+        rest,
+      )
+
+    // 4-byte UTF-8 character
+    <<b1, b2, b3, rest:bits>>
+      if b0 >= 0xF0
+      && b0 <= 0xF7
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+      && b3 >= 0x80
+      && b3 <= 0xBF
+    ->
+      lossy_checked_codepoint(
+        int.bitwise_and(b0, 0x07)
+          * 262_144
+          + int.bitwise_and(b1, 0x3F)
+          * 4096
+          + int.bitwise_and(b2, 0x3F)
+          * 64
+          + int.bitwise_and(b3, 0x3F),
+        0x10000,
+        rest,
+      )
+
+    // Not a lead byte, or the continuation bytes are missing or malformed.
+    _ -> Error(Nil)
+  }
+}
+
+// Turns a decoded value into a one-character string paired with `rest`.
+// Fails when the value is below `minimum`, which means it was encoded with
+// more bytes than it needs (an overlong form), or when `string.utf_codepoint`
+// rejects it.
+@target(javascript)
+fn lossy_checked_codepoint(
+  value: Int,
+  minimum: Int,
+  rest: BitArray,
+) -> Result(#(String, BitArray), Nil) {
+  case value >= minimum, string.utf_codepoint(value) {
+    True, Ok(codepoint) -> Ok(#(string.from_utf_codepoints([codepoint]), rest))
+    _, _ -> Error(Nil)
+  }
+}
+
 /// Creates a new bit array by joining multiple binaries.
 ///
 /// ## Examples
diff --git a/test/gleam/bit_array_test.gleam b/test/gleam/bit_array_test.gleam
index 638a8b2d..ea2b3b03 100644
--- a/test/gleam/bit_array_test.gleam
+++ b/test/gleam/bit_array_test.gleam
@@ -233,6 +233,42 @@ pub fn to_string_erlang_only_test() {
   |> should.equal(Error(Nil))
 }
 
+pub fn to_string_lossy_test() {
+  <<>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("")
+
+  <<0x80, "A":utf8, 0x81>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("�A�")
+
+  // Test some codepoints that require 2/3/4 bytes to be stored as UTF-8
+  <<"£И한𐍈":utf8>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("£И한𐍈")
+
+  // Overlong encodings and surrogates are not valid UTF-8, so every byte of
+  // them is reported as invalid
+  <<0xC0, 0x80, "A":utf8>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("��A")
+
+  <<0xE0, 0x80, 0x80>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("���")
+
+  <<0xED, 0xA0, 0x80>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("���")
+}
+
+@target(erlang)
+pub fn to_string_lossy_erlang_only_test() {
+  <<"ø":utf8, 50:4>>
+  |> bit_array.to_string_lossy(fn(_) { "�" })
+  |> should.equal("ø�")
+}
+
 pub fn is_utf8_test() {
   <<>>
   |> bit_array.is_utf8