Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bit_array.to_string_lossy #800

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

- The `bit_array` module gains the `to_string_lossy` function.

## v0.54.0 - 2025-02-04

- The deprecated `drop_left`, `drop_right`, `pad_left`, `pad_right`,
`trim_left`, and `trim_right` functions have been removed.
- Fixed a bug that would result in `list.unique` having quadratic runtime.
Expand Down
2 changes: 1 addition & 1 deletion gleam.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = "gleam_stdlib"
version = "0.53.0"
version = "0.54.0"
gleam = ">= 0.32.0"
licences = ["Apache-2.0"]
description = "A standard library for the Gleam programming language"
Expand Down
148 changes: 148 additions & 0 deletions src/gleam/bit_array.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
@external(erlang, "gleam_stdlib", "identity")
fn unsafe_to_string(a: BitArray) -> String

/// Decodes a bit array as UTF-8 text, substituting any data that cannot be
/// decoded.
///
/// Data that is not valid UTF-8 is handed to `map_invalid_bits`, and the
/// string it returns takes the place of that data in the result.
///
/// ## Examples
///
/// ```gleam
/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
/// // -> "A�1�"
/// ```
///
pub fn to_string_lossy(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  // Start with an empty accumulator; the target-specific helper does the
  // actual decoding.
  bits |> to_string_lossy_impl(map_invalid_bits, "")
}

@target(erlang)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    // Nothing left to decode: the accumulator is the finished string.
    <<>> -> acc

    // The input starts with a well-formed UTF-8 codepoint: keep it unchanged.
    <<codepoint:utf8_codepoint, remaining:bits>> -> {
      let decoded = string.from_utf_codepoints([codepoint])
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> decoded)
    }

    // A whole byte that does not begin a valid codepoint: let the callback
    // choose its replacement, then carry on with whatever follows it.
    <<invalid_byte:bytes-1, remaining:bits>> -> {
      let replacement = map_invalid_bits(invalid_byte)
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> replacement)
    }

    // None of the above matched, so less than one full byte remains. Pass
    // the leftover bits to the callback in a single call and stop.
    trailing_bits -> acc <> map_invalid_bits(trailing_bits)
  }
}

// The following is the same as the above function but supports the JavaScript
// target due to not using the `utf8_codepoint` bit array segment type. Once
// the JavaScript target supports `utf8_codepoint` this function should be
// removed.
@target(javascript)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a shame so much is needed for JavaScript, and this introduces a new @target, which we have been working hard to remove.

How challenging would it be to have utf8_codepoint support in JavaScript? That seems like it would be a better solution.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bit array pattern matching in JS would need to be expanded to support dynamic sizes first; then `utf8_codepoint` could be implemented on top of that.

I think it's reasonable to wait for that to happen, or alternatively a @target can be avoided by either using the longer JS-compatible version on both targets (it works fine on Erlang), or by implementing the JS version directly in JS as an external function.

fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  // Hand-written UTF-8 decoder. Each multi-byte clause only matches
  // well-formed sequences: no overlong encodings, no surrogates
  // (U+D800..U+DFFF) and nothing above U+10FFFF. Anything else falls through
  // to the single-byte clause near the end, so ill-formed input is reported
  // to `map_invalid_bits` one byte at a time.
  //
  // NOTE(review): every clause requires the remainder to be `bytes`, so input
  // that is not a whole number of bytes appears to reach the final catch-all
  // clause and be passed to the callback in one piece - confirm whether that
  // is intended.
  case bits {
    // Nothing left to decode.
    <<>> -> acc

    // 1-byte sequence: U+0000..U+007F
    <<b0, rest:bytes>> if b0 <= 0x7F -> {
      let acc =
        acc
        <> case string.utf_codepoint(b0) {
          Ok(codepoint) -> string.from_utf_codepoints([codepoint])
          // Not expected for values in this range; kept as a fallback.
          Error(Nil) -> map_invalid_bits(<<b0>>)
        }

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 2-byte sequence: U+0080..U+07FF
    // Lead bytes 0xC0 and 0xC1 could only encode values below U+0080
    // (overlong forms), so the valid range starts at 0xC2.
    <<b0, b1, rest:bytes>>
      if b0 >= 0xC2 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F)

      let acc =
        acc
        <> case string.utf_codepoint(codepoint_value) {
          Ok(codepoint) -> string.from_utf_codepoints([codepoint])
          // Not expected given the guard above; kept as a fallback.
          Error(Nil) -> map_invalid_bits(<<b0, b1>>)
        }

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 3-byte sequence: U+0800..U+FFFF, excluding surrogates
    // - 0xE0 must be followed by 0xA0..0xBF, otherwise the form is overlong.
    // - 0xED must be followed by 0x80..0x9F, otherwise it encodes a surrogate.
    <<b0, b1, b2, rest:bytes>>
      if b0 >= 0xE0
      && b0 <= 0xEF
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
      && { b0 != 0xE0 || b1 >= 0xA0 }
      && { b0 != 0xED || b1 <= 0x9F }
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x0F)
        * 4096
        + int.bitwise_and(b1, 0x3F)
        * 64
        + int.bitwise_and(b2, 0x3F)

      let acc =
        acc
        <> case string.utf_codepoint(codepoint_value) {
          Ok(codepoint) -> string.from_utf_codepoints([codepoint])
          // Not expected given the guard above; kept as a fallback.
          Error(Nil) -> map_invalid_bits(<<b0, b1, b2>>)
        }

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 4-byte sequence: U+10000..U+10FFFF
    // - 0xF0 must be followed by 0x90..0xBF, otherwise the form is overlong.
    // - 0xF4 must be followed by 0x80..0x8F, otherwise the value exceeds
    //   U+10FFFF. Lead bytes 0xF5..0xF7 always exceed it, so they are excluded.
    <<b0, b1, b2, b3, rest:bytes>>
      if b0 >= 0xF0
      && b0 <= 0xF4
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
      && b3 >= 0x80
      && b3 <= 0xBF
      && { b0 != 0xF0 || b1 >= 0x90 }
      && { b0 != 0xF4 || b1 <= 0x8F }
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x07)
        * 262_144
        + int.bitwise_and(b1, 0x3F)
        * 4096
        + int.bitwise_and(b2, 0x3F)
        * 64
        + int.bitwise_and(b3, 0x3F)

      let acc =
        acc
        <> case string.utf_codepoint(codepoint_value) {
          Ok(codepoint) -> string.from_utf_codepoints([codepoint])
          // Not expected given the guard above; kept as a fallback.
          Error(Nil) -> map_invalid_bits(<<b0, b1, b2, b3>>)
        }

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // A byte that does not start a well-formed sequence: replace just that
    // byte and resume decoding immediately after it.
    <<x:bytes-1, rest:bytes>> ->
      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))

    // Whatever is left matched none of the byte-oriented patterns above.
    _ -> acc <> map_invalid_bits(bits)
  }
}

/// Creates a new bit array by joining multiple binaries.
///
/// ## Examples
Expand Down
22 changes: 22 additions & 0 deletions test/gleam/bit_array_test.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,28 @@ pub fn to_string_erlang_only_test() {
|> should.equal(Error(Nil))
}

pub fn to_string_lossy_test() {
  // Every invalid chunk is replaced by the same marker in these cases.
  let replace = fn(_) { "�" }

  // Empty input yields an empty string.
  bit_array.to_string_lossy(<<>>, replace)
  |> should.equal("")

  // Invalid bytes on either side of valid text are each replaced.
  bit_array.to_string_lossy(<<0x80, "A":utf8, 0x81>>, replace)
  |> should.equal("�A�")

  // Codepoints needing 2, 3 and 4 bytes in UTF-8 pass through untouched.
  bit_array.to_string_lossy(<<"£И한𐍈":utf8>>, replace)
  |> should.equal("£И한𐍈")
}

@target(erlang)
pub fn to_string_lossy_erlang_only_test() {
  // Valid text followed by four trailing bits that do not fill a byte: the
  // leftover bits are replaced by a single marker.
  let input = <<"ø":utf8, 50:4>>

  bit_array.to_string_lossy(input, fn(_) { "�" })
  |> should.equal("ø�")
}

pub fn is_utf8_test() {
<<>>
|> bit_array.is_utf8
Expand Down