gleam-lang · richard-viney · Feb 4, 2025 · Feb 6, 2025 · lpil · Feb 9, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+- The `bit_array` module gains the `to_string_lossy` function.
+
+## v0.54.0 - 2025-02-04
+
 - The deprecated `drop_left`, `drop_right`, `pad_left`, `pad_right`,
   `trim_left`, and `trim_right` functions have been removed.
 - Fixed a bug that would result in `list.unique` having quadratic runtime.

diff --git a/gleam.toml b/gleam.toml
@@ -1,5 +1,5 @@
 name = "gleam_stdlib"
-version = "0.53.0"
+version = "0.54.0"
 gleam = ">= 0.32.0"
 licences = ["Apache-2.0"]
 description = "A standard library for the Gleam programming language"

diff --git a/src/gleam/bit_array.gleam b/src/gleam/bit_array.gleam
@@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
 @external(erlang, "gleam_stdlib", "identity")
 fn unsafe_to_string(a: BitArray) -> String
 
+/// Builds a string from a bit array without failing on data that is not
+/// valid UTF-8.
+///
+/// Valid UTF-8 content is copied into the result unchanged. Each chunk of
+/// data that cannot be decoded is given to `map_invalid_bits`, and the string
+/// that callback returns is spliced into the result where the chunk was. The
+/// callback may be invoked more than once, in order from the start of the
+/// bit array.
+///
+/// ## Examples
+///
+/// ```gleam
+/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
+/// // -> "A�1�"
+/// ```
+///
+pub fn to_string_lossy(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+) -> String {
+  // Decoding starts with nothing accumulated yet.
+  let initial_output = ""
+  to_string_lossy_impl(bits, map_invalid_bits, initial_output)
+}
+
+// Erlang implementation of `to_string_lossy`. Walks the bit array from the
+// front, appending to `acc` either the next decoded codepoint or the
+// callback's replacement for the next undecodable byte.
+@target(erlang)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    // Nothing left to decode
+    <<>> -> acc
+
+    // The input starts with a valid UTF-8 codepoint
+    <<codepoint:utf8_codepoint, remaining:bits>> -> {
+      let decoded = string.from_utf_codepoints([codepoint])
+      to_string_lossy_impl(remaining, map_invalid_bits, acc <> decoded)
+    }
+
+    // The input starts with a byte that is not the start of a valid
+    // codepoint, so that single byte is replaced and decoding resumes after it
+    <<invalid_byte:bytes-1, remaining:bits>> -> {
+      let replacement = map_invalid_bits(invalid_byte)
+      to_string_lossy_impl(remaining, map_invalid_bits, acc <> replacement)
+    }
+
+    // Less than one whole byte remains; it is replaced as a single chunk
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// JavaScript counterpart of the Erlang `to_string_lossy_impl`. It decodes
+// UTF-8 by hand so that it does not need the `utf8_codepoint` bit array
+// segment type. Once the JavaScript target supports `utf8_codepoint` this
+// function should be removed.
+//
+// A candidate sequence is accepted only when its lead and continuation bytes
+// have the right bit patterns, the decoded value is not an overlong encoding,
+// and `string.utf_codepoint` accepts the value. When a candidate is rejected,
+// each of its bytes is passed to `map_invalid_bits` on its own, mirroring the
+// Erlang implementation, which consumes invalid input one byte at a time.
+//
+// The recursive calls are kept as direct self-calls in tail position; the
+// validation helper below deliberately does not call back into this function.
+@target(javascript)
+fn to_string_lossy_impl(
+  bits: BitArray,
+  map_invalid_bits: fn(BitArray) -> String,
+  acc: String,
+) -> String {
+  case bits {
+    <<>> -> acc
+
+    // 1-byte UTF-8 character
+    <<b0, rest:bytes>> if b0 <= 0x7F -> {
+      let acc =
+        acc
+        <> case to_string_lossy_decode(b0, 0) {
+          Ok(decoded) -> decoded
+          Error(Nil) -> map_invalid_bits(<<b0>>)
+        }
+
+      to_string_lossy_impl(rest, map_invalid_bits, acc)
+    }
+
+    // 2-byte UTF-8 character
+    <<b0, b1, rest:bytes>>
+      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
+    -> {
+      let codepoint_value =
+        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F)
+
+      let acc =
+        acc
+        <> case to_string_lossy_decode(codepoint_value, 0x80) {
+          Ok(decoded) -> decoded
+          // Values below 0x80 fit in one byte, so this was an overlong
+          // encoding
+          Error(Nil) -> map_invalid_bits(<<b0>>) <> map_invalid_bits(<<b1>>)
+        }
+
+      to_string_lossy_impl(rest, map_invalid_bits, acc)
+    }
+
+    // 3-byte UTF-8 character
+    <<b0, b1, b2, rest:bytes>>
+      if b0 >= 0xE0
+      && b0 <= 0xEF
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+    -> {
+      let codepoint_value =
+        int.bitwise_and(b0, 0x0F)
+        * 4096
+        + int.bitwise_and(b1, 0x3F)
+        * 64
+        + int.bitwise_and(b2, 0x3F)
+
+      let acc =
+        acc
+        <> case to_string_lossy_decode(codepoint_value, 0x800) {
+          Ok(decoded) -> decoded
+          // Either an overlong encoding (value below 0x800) or a value that
+          // `string.utf_codepoint` rejects
+          Error(Nil) ->
+            map_invalid_bits(<<b0>>)
+            <> map_invalid_bits(<<b1>>)
+            <> map_invalid_bits(<<b2>>)
+        }
+
+      to_string_lossy_impl(rest, map_invalid_bits, acc)
+    }
+
+    // 4-byte UTF-8 character
+    <<b0, b1, b2, b3, rest:bytes>>
+      if b0 >= 0xF0
+      && b0 <= 0xF7
+      && b1 >= 0x80
+      && b1 <= 0xBF
+      && b2 >= 0x80
+      && b2 <= 0xBF
+      && b3 >= 0x80
+      && b3 <= 0xBF
+    -> {
+      let codepoint_value =
+        int.bitwise_and(b0, 0x07)
+        * 262_144
+        + int.bitwise_and(b1, 0x3F)
+        * 4096
+        + int.bitwise_and(b2, 0x3F)
+        * 64
+        + int.bitwise_and(b3, 0x3F)
+
+      let acc =
+        acc
+        <> case to_string_lossy_decode(codepoint_value, 0x10000) {
+          Ok(decoded) -> decoded
+          // Either an overlong encoding (value below 0x10000) or a value that
+          // `string.utf_codepoint` rejects
+          Error(Nil) ->
+            map_invalid_bits(<<b0>>)
+            <> map_invalid_bits(<<b1>>)
+            <> map_invalid_bits(<<b2>>)
+            <> map_invalid_bits(<<b3>>)
+        }
+
+      to_string_lossy_impl(rest, map_invalid_bits, acc)
+    }
+
+    // A byte that cannot start any of the sequences above: replace just that
+    // byte and resume decoding after it
+    <<x:bytes-1, rest:bytes>> ->
+      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))
+
+    // Whatever is left matched none of the byte patterns above; it is
+    // replaced as a single chunk
+    _ -> acc <> map_invalid_bits(bits)
+  }
+}
+
+// Converts a decoded codepoint value to a string. Returns an error when the
+// value is smaller than `minimum`, the smallest value that needs the sequence
+// length it was decoded from (so anything lower is an overlong encoding), or
+// when `string.utf_codepoint` rejects the value.
+@target(javascript)
+fn to_string_lossy_decode(value: Int, minimum: Int) -> Result(String, Nil) {
+  case value >= minimum {
+    True ->
+      case string.utf_codepoint(value) {
+        Ok(codepoint) -> Ok(string.from_utf_codepoints([codepoint]))
+        Error(Nil) -> Error(Nil)
+      }
+    False -> Error(Nil)
+  }
+}
+
 /// Creates a new bit array by joining multiple binaries.
 ///
 /// ## Examples

diff --git a/test/gleam/bit_array_test.gleam b/test/gleam/bit_array_test.gleam
@@ -233,6 +233,28 @@ pub fn to_string_erlang_only_test() {
   |> should.equal(Error(Nil))
 }
 
+pub fn to_string_lossy_test() {
+  // Every invalid chunk is replaced with the same marker
+  let replace_invalid = fn(_) { "�" }
+
+  // An empty bit array gives an empty string
+  bit_array.to_string_lossy(<<>>, replace_invalid)
+  |> should.equal("")
+
+  // Invalid bytes on either side of valid text are each replaced
+  bit_array.to_string_lossy(<<0x80, "A":utf8, 0x81>>, replace_invalid)
+  |> should.equal("�A�")
+
+  // Codepoints that require 2/3/4 bytes to be stored as UTF-8 pass through
+  bit_array.to_string_lossy(<<"£И한𐍈":utf8>>, replace_invalid)
+  |> should.equal("£И한𐍈")
+}
+
+// Covers a bit array whose size is not a whole number of bytes: the four
+// trailing bits are expected to be replaced as one invalid chunk.
+@target(erlang)
+pub fn to_string_lossy_erlang_only_test() {
+  let replace_invalid = fn(_) { "�" }
+
+  bit_array.to_string_lossy(<<"ø":utf8, 50:4>>, replace_invalid)
+  |> should.equal("ø�")
+}
+
 pub fn is_utf8_test() {
   <<>>
   |> bit_array.is_utf8