Skip to content

Commit

Permalink
Merge pull request #7496 from HajagosNorbert/ascii-builtins
Browse files Browse the repository at this point in the history
with_ascii_lowercased zig builtin
  • Loading branch information
smores56 authored Jan 20, 2025
2 parents 809fe23 + f372e18 commit 255a388
Show file tree
Hide file tree
Showing 49 changed files with 1,859 additions and 1,707 deletions.
1 change: 1 addition & 0 deletions crates/compiler/builtins/bitcode/src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ comptime {
exportStrFn(str.withCapacityC, "with_capacity");
exportStrFn(str.strAllocationPtr, "allocation_ptr");
exportStrFn(str.strReleaseExcessCapacity, "release_excess_capacity");
exportStrFn(str.strWithAsciiLowercased, "with_ascii_lowercased");

for (INTEGERS) |T| {
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
Expand Down
81 changes: 69 additions & 12 deletions crates/compiler/builtins/bitcode/src/str.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ const utils = @import("utils.zig");
const RocList = @import("list.zig").RocList;
const UpdateMode = utils.UpdateMode;
const std = @import("std");
const ascii = std.ascii;
const mem = std.mem;
const unicode = std.unicode;
const testing = std.testing;
Expand Down Expand Up @@ -370,11 +371,17 @@ pub const RocStr = extern struct {
}

// Returns the reference count of this string.
// Small strings, and zero-capacity strings that are not seamless slices,
// report 1 without reading any memory.
fn refcount(self: RocStr) usize {
if ((self.getCapacity() == 0 and !self.isSeamlessSlice()) or self.isSmallStr()) {
const is_seamless_slice = self.isSeamlessSlice();
if ((self.getCapacity() == 0 and !is_seamless_slice) or self.isSmallStr()) {
return 1;
}

const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(self.bytes)));
// For a seamless slice the count is located relative to getAllocationPtr()
// rather than relative to `bytes`.
const data_ptr = if (is_seamless_slice)
self.getAllocationPtr()
else
self.bytes;

// The count is the usize stored immediately before the data pointer.
const ptr: [*]usize = @as([*]usize, @ptrCast(@alignCast(data_ptr)));
return (ptr - 1)[0];
}

Expand Down Expand Up @@ -611,16 +618,6 @@ fn initFromSmallStr(slice_bytes: [*]u8, len: usize, _: usize) RocStr {
return RocStr.init(slice_bytes, len);
}

// Wraps `slice_bytes[0..len]` as a seamless slice; no bytes are copied.
// `alloc_ptr` is stored verbatim, so the caller must pass it already shifted
// into the form a seamless slice stores.
fn initFromBigStr(slice_bytes: [*]u8, len: usize, alloc_ptr: usize) RocStr {
    // Tag the length with the seamless-slice bit.
    const tagged_length = len | SEAMLESS_SLICE_BIT;

    return RocStr{
        .bytes = slice_bytes,
        .length = tagged_length,
        .capacity_or_alloc_ptr = alloc_ptr,
    };
}

fn strSplitOnHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
if (delimiter.len() == 0) {
string.incref(1);
Expand Down Expand Up @@ -1968,6 +1965,66 @@ fn countTrailingWhitespaceBytes(string: RocStr) usize {
return byte_count;
}

// Str.with_ascii_lowercased
//
// Returns `string` with every ASCII uppercase byte replaced by its lowercase
// counterpart; all other bytes are left untouched.
//
// A unique string is lowercased in place and returned. Otherwise the bytes
// are copied into a new RocStr, this function's reference to the original is
// released, and the copy is lowercased and returned.
pub fn strWithAsciiLowercased(string: RocStr) callconv(.C) RocStr {
    var new_str = if (string.isUnique())
        string
    else blk: {
        // Copy BEFORE releasing our reference: while we still hold it, the
        // source bytes cannot be freed out from under fromSlice.
        const copy = RocStr.fromSlice(string.asSlice());
        string.decref();
        break :blk copy;
    };

    // Use the result's own length so the released handle is not touched again.
    const new_str_bytes = new_str.asU8ptrMut()[0..new_str.len()];
    for (new_str_bytes) |*c| {
        c.* = ascii.toLower(c.*);
    }
    return new_str;
}

test "withAsciiLowercased: small str" {
    // The input must be a small string for this case to be meaningful.
    const input = RocStr.fromSlice("cOFFÉ");
    try expect(input.isSmallStr());

    const want = RocStr.fromSlice("coffÉ");
    defer want.decref();

    const got = strWithAsciiLowercased(input);
    defer got.decref();

    // Only the ASCII letters change, and the result is still a small string.
    try expect(got.isSmallStr());
    try expect(got.eq(want));
}

test "withAsciiLowercased: non small str" {
    // The input must NOT be a small string for this case to be meaningful.
    const input = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
    defer input.decref();
    try expect(!input.isSmallStr());

    const want = RocStr.fromSlice("coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
    defer want.decref();

    const got = strWithAsciiLowercased(input);

    // ASCII letters are lowercased; the result is not a small string either.
    try expect(!got.isSmallStr());
    try expect(got.eq(want));
}

test "withAsciiLowercased: seamless slice" {
    // Build the input by slicing off the first byte of a longer string; the
    // test asserts that the result of that slicing is a seamless slice.
    const backing = RocStr.fromSlice("cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ");
    const input = substringUnsafeC(backing, 1, backing.len() - 1);
    defer input.decref();

    try expect(input.isSeamlessSlice());

    const want = RocStr.fromSlice("offÉ coffÉ coffÉ coffÉ coffÉ coffÉ");
    defer want.decref();

    const got = strWithAsciiLowercased(input);

    try expect(!got.isSmallStr());
    try expect(got.eq(want));
}

// No-op callback with the C calling convention: ignores its pointer argument and does nothing.
fn rcNone(_: ?[*]u8) callconv(.C) void {}

fn decStr(ptr: ?[*]u8) callconv(.C) void {
Expand Down
27 changes: 27 additions & 0 deletions crates/compiler/builtins/roc/Str.roc
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ module [
contains,
drop_prefix,
drop_suffix,
with_ascii_lowercased,
]

import Bool exposing [Bool]
Expand Down Expand Up @@ -1092,3 +1093,29 @@ drop_suffix = |haystack, suffix|
substring_unsafe(haystack, start, len)
else
haystack

## Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) lowercased.
## Non-ASCII characters are left unmodified. For example:
##
## ```roc
## expect "CAFÉ".with_ascii_lowercased() == "cafÉ"
## ```
##
## This function is useful for things like [command-line options](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option)
## and [environment variables](https://en.wikipedia.org/wiki/Environment_variable)
## where you know in advance that you're dealing with a hardcoded string containing only ASCII characters.
## It has better performance than lowercasing operations which take Unicode into account.
##
## That said, strings received from user input can always contain
## non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works
## differently in different languages. For example, the string `"I"` lowercases to `"i"`
## in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I))
## in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/),
## so we have a separate [`unicode` package](https://github.com/roc-lang/unicode)
## for Unicode capitalization that can be upgraded independently from the language's builtins.
##
## To do a case-insensitive comparison of the ASCII characters in a string,
## use [`caseless_ascii_equals`](#caseless_ascii_equals).
with_ascii_lowercased : Str -> Str

expect Str.with_ascii_lowercased("cOFFÉ") == "coffÉ"
1 change: 1 addition & 0 deletions crates/compiler/builtins/src/bitcode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_ALLOCATION_PTR: &str = "roc_builtins.str.allocation_ptr";
pub const STR_RELEASE_EXCESS_CAPACITY: &str = "roc_builtins.str.release_excess_capacity";
pub const STR_WITH_ASCII_LOWERCASED: &str = "roc_builtins.str.with_ascii_lowercased";

pub const LIST_MAP: &str = "roc_builtins.list.map";
pub const LIST_MAP2: &str = "roc_builtins.list.map2";
Expand Down
1 change: 1 addition & 0 deletions crates/compiler/can/src/builtins.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ map_symbol_to_lowlevel_and_arity! {
StrToNum; STR_TO_NUM; 1,
StrWithCapacity; STR_WITH_CAPACITY; 1,
StrReleaseExcessCapacity; STR_RELEASE_EXCESS_CAPACITY; 1,
StrWithAsciiLowercased; STR_WITH_ASCII_LOWERCASED; 1,

ListLenUsize; LIST_LEN_USIZE; 1,
ListLenU64; LIST_LEN_U64; 1,
Expand Down
7 changes: 7 additions & 0 deletions crates/compiler/gen_dev/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1712,6 +1712,13 @@ trait Backend<'a> {
arg_layouts,
ret_layout,
),
LowLevel::StrWithAsciiLowercased => self.build_fn_call(
sym,
bitcode::STR_WITH_ASCII_LOWERCASED.to_string(),
args,
arg_layouts,
ret_layout,
),
LowLevel::StrToNum => {
let number_layout = match self.interner().get_repr(*ret_layout) {
LayoutRepr::Struct(field_layouts) => field_layouts[0], // TODO: why is it sometimes a struct?
Expand Down
12 changes: 12 additions & 0 deletions crates/compiler/gen_llvm/src/llvm/lowlevel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,7 @@ pub(crate) fn run_low_level<'a, 'ctx>(
bitcode::STR_WITH_CAPACITY,
)
}

ListLenU64 => {
// List.len : List * -> U64
arguments!(list);
Expand Down Expand Up @@ -635,6 +636,17 @@ pub(crate) fn run_low_level<'a, 'ctx>(
list_element_layout!(layout_interner, result_layout),
)
}
StrWithAsciiLowercased => {
arguments!(string);

call_str_bitcode_fn(
env,
&[string],
&[],
BitcodeReturns::Str,
bitcode::STR_WITH_ASCII_LOWERCASED,
)
}
ListConcat => {
debug_assert_eq!(args.len(), 2);

Expand Down
3 changes: 3 additions & 0 deletions crates/compiler/gen_wasm/src/low_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,9 @@ impl<'a> LowLevelCall<'a> {
self.load_args_and_call_zig(backend, bitcode::STR_SUBSTRING_UNSAFE)
}
StrWithCapacity => self.load_args_and_call_zig(backend, bitcode::STR_WITH_CAPACITY),
StrWithAsciiLowercased => {
self.load_args_and_call_zig(backend, bitcode::STR_WITH_ASCII_LOWERCASED)
}

// List
ListLenU64 => {
Expand Down
2 changes: 2 additions & 0 deletions crates/compiler/module/src/low_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pub enum LowLevel {
StrReserve,
StrWithCapacity,
StrReleaseExcessCapacity,
StrWithAsciiLowercased,
ListLenUsize,
ListLenU64,
ListWithCapacity,
Expand Down Expand Up @@ -265,6 +266,7 @@ map_symbol_to_lowlevel! {
StrToNum <= STR_TO_NUM;
StrWithCapacity <= STR_WITH_CAPACITY;
StrReleaseExcessCapacity <= STR_RELEASE_EXCESS_CAPACITY;
StrWithAsciiLowercased <= STR_WITH_ASCII_LOWERCASED;
ListLenU64 <= LIST_LEN_U64;
ListLenUsize <= LIST_LEN_USIZE;
ListGetCapacity <= LIST_CAPACITY;
Expand Down
1 change: 1 addition & 0 deletions crates/compiler/module/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1420,6 +1420,7 @@ define_builtins! {
48 STR_RELEASE_EXCESS_CAPACITY: "release_excess_capacity"
49 STR_DROP_PREFIX: "drop_prefix"
50 STR_DROP_SUFFIX: "drop_suffix"
51 STR_WITH_ASCII_LOWERCASED: "with_ascii_lowercased"
}
6 LIST: "List" => {
0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
Expand Down
1 change: 1 addition & 0 deletions crates/compiler/mono/src/drop_specialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1549,6 +1549,7 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
ListPrepend => RC::Rc,
StrJoinWith => RC::NoRc,
ListSortWith => RC::Rc,
StrWithAsciiLowercased => RC::Rc,

ListAppendUnsafe
| ListReserve
Expand Down
1 change: 1 addition & 0 deletions crates/compiler/mono/src/inc_dec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,7 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
StrReleaseExcessCapacity => &[OWNED],
ListIncref => &[OWNED],
ListDecref => &[OWNED],
StrWithAsciiLowercased => &[OWNED],

Eq | NotEq => &[BORROWED, BORROWED],

Expand Down
12 changes: 12 additions & 0 deletions crates/compiler/solve/tests/solve_expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3826,6 +3826,18 @@ mod solve_expr {
);
}

// Asserts, via `infer_eq_without_problem`, that `Str.with_ascii_lowercased`
// is inferred as `Str -> Str`.
#[test]
fn str_with_ascii_lowercased() {
infer_eq_without_problem(
indoc!(
r"
Str.with_ascii_lowercased
"
),
"Str -> Str",
);
}

#[test]
fn list_take_first() {
infer_eq_without_problem(
Expand Down
26 changes: 26 additions & 0 deletions crates/compiler/test_gen/src/gen_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2060,3 +2060,29 @@ fn str_drop_suffix() {
RocStr
);
}

// Evaluates `Str.with_ascii_lowercased("cOFFÉ")` on every enabled backend and
// expects "coffÉ": the ASCII letters are lowercased, the non-ASCII "É" is kept.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev", feature = "gen-wasm"))]
fn with_ascii_lowercased() {
assert_evals_to!(
r#"
Str.with_ascii_lowercased("cOFFÉ")
"#,
RocStr::from("coffÉ"),
RocStr
);
}

// Lowercases a longer string while `original` is used again afterwards (as the
// second argument to `Str.drop_prefix`), and expects the fully lowercased text.
// NOTE(review): presumably the later use keeps `original` shared during the
// call so the copying path is exercised — confirm against the refcount pass.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev", feature = "gen-wasm"))]
fn with_ascii_lowercased_non_zero_refcount() {
assert_evals_to!(
r#"
original = "cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ cOFFÉ"
res = Str.with_ascii_lowercased(original)
Str.drop_prefix(res, original)
"#,
RocStr::from("coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ coffÉ"),
RocStr
);
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions crates/compiler/test_mono/generated/dbg_expr.txt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 255a388

Please sign in to comment.