Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/hstr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ atom_size_64 = []
atom_size_128 = []

[dependencies]
cfg-if = "1.0.0"
hashbrown = { version = "0.14.3", default-features = false }
new_debug_unreachable = "1.0.4"
once_cell = "1.18.0"
phf = "0.11.2"
rkyv = { version = "0.7.42", optional = true }
rustc-hash = "1.1.0"
serde = { version = "1.0.192", optional = true }
static_assertions = "1.1.0"
triomphe = "0.1.11"

[dev-dependencies]
Expand Down
1 change: 1 addition & 0 deletions crates/hstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use crate::dynamic::Entry;

mod dynamic;
mod global_store;
mod repr;
mod tagged_value;
#[cfg(test)]
mod tests;
Expand Down
169 changes: 169 additions & 0 deletions crates/hstr/src/repr/capacity.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
use crate::repr::HEAP_MASK;

// Number of bytes a `usize` occupies on the compilation target. Every
// fixed-size byte array in this module is sized from this constant.
const USIZE_SIZE: usize = core::mem::size_of::<usize>();

/// Builds the byte pattern stored in [`CAPACITY_IS_ON_THE_HEAP`].
///
/// Every byte is `255` except the final one, which holds [`HEAP_MASK`].
#[allow(non_snake_case)]
const fn CAP_ON_HEAP_FLAG() -> [u8; USIZE_SIZE] {
    // start from an all-ones array, then stamp the discriminant into the
    // trailing byte
    let mut marker = [u8::MAX; USIZE_SIZE];
    let last = USIZE_SIZE - 1;
    marker[last] = HEAP_MASK;
    marker
}

/// State that describes the capacity as being stored on the heap.
///
/// All bytes `255`, with the last being [`HEAP_MASK`], using the same amount of
/// bytes as a `usize`.
///
/// Example (64-bit): `[255, 255, 255, 255, 255, 255, 255, 254]`
const CAPACITY_IS_ON_THE_HEAP: [u8; USIZE_SIZE] = CAP_ON_HEAP_FLAG();

// how many bytes we can use for capacity; the remaining byte of the `usize`
// always holds `HEAP_MASK`
const SPACE_FOR_CAPACITY: usize = USIZE_SIZE - 1;
// the maximum value we're able to store inline, e.g. on 64-bit arch this is
// 2^56 - 2
//
// note: the largest value that fits in `SPACE_FOR_CAPACITY` bytes,
// 2^(8 * SPACE_FOR_CAPACITY) - 1, is deliberately excluded: its bytes are all
// 255, so once the last byte is set to `HEAP_MASK` it would be
// indistinguishable from `CAPACITY_IS_ON_THE_HEAP`. The value is computed with
// `usize::pow` directly in this const initializer.
pub const MAX_VALUE: usize = 2usize.pow(SPACE_FOR_CAPACITY as u32 * 8) - 2;

/// An integer type that uses `core::mem::size_of::<usize>() - 1` bytes to store
/// the capacity of a heap buffer.
///
/// Assuming a 64-bit arch, a [`super::BoxString`] uses 8 bytes for a pointer,
/// 8 bytes for a length, and then needs 1 byte for a discriminant. We need to
/// store the capacity somewhere, and we could store it on the heap, but we also
/// have 7 unused bytes. [`Capacity`] handles storing a value in these 7 bytes,
/// falling back to a marker value (see [`Capacity::is_heap`]) if that's not
/// possible, at which point we'll store the capacity on the heap.
///
/// # Max Values
/// * __64-bit:__ `(2 ^ (7 * 8)) - 2 = 72_057_594_037_927_934 ~= 64 petabytes`
/// * __32-bit:__ `(2 ^ (3 * 8)) - 2 = 16_777_214 ~= 16 megabytes`
///
/// Practically speaking, on a 64-bit architecture we'll never need to store the
/// capacity on the heap, because it's impossible to create a string that is 64
/// petabytes or larger. But for 32-bit architectures we need to be able to
/// store a capacity larger than 16 megabytes, since a string larger than 16
/// megabytes probably isn't that uncommon.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[cfg_attr(target_pointer_width = "64", repr(align(8)))]
#[cfg_attr(target_pointer_width = "32", repr(align(4)))]
pub struct Capacity([u8; USIZE_SIZE]);

// Compile-time checks that `Capacity` has exactly the size and alignment of a
// `usize`.
static_assertions::assert_eq_size!(Capacity, usize);
static_assertions::assert_eq_align!(Capacity, usize);

impl Capacity {
#[inline]
pub const fn new(capacity: usize) -> Self {
cfg_if::cfg_if! {
if #[cfg(target_pointer_width = "64")] {
// on 64-bit arches we can always fit the capacity inline
debug_assert!(capacity <= MAX_VALUE);

let mut bytes = capacity.to_le_bytes();
bytes[core::mem::size_of::<usize>() - 1] = HEAP_MASK;
Capacity(bytes)
} else if #[cfg(target_pointer_width = "32")] {
// on 32-bit arches we might need to store the capacity on the heap
if capacity > MAX_VALUE {
// if we need the last byte to encode this capacity then we need to put the capacity on
// the heap. return an Error so `BoxString` can do the right thing
Capacity(CAPACITY_IS_ON_THE_HEAP)
} else {
// otherwise, we can store this capacity inline! Set the last byte to be our `HEAP_MASK`
// for our discriminant, using the leading bytes to store the actual value
let mut bytes = capacity.to_le_bytes();
bytes[core::mem::size_of::<usize>() - 1] = HEAP_MASK;
Capacity(bytes)
}
} else {
compile_error!("Unsupported target_pointer_width");
}
}
}

/// Re-interprets a [`Capacity`] as a `usize`
///
/// # SAFETY:
/// * `self` must be less than or equal to [`MAX_VALUE`]
#[inline(always)]
pub unsafe fn as_usize(&self) -> usize {
let mut usize_buf = [0u8; USIZE_SIZE];
// SAFETY:
// * `src` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than
// `USIZE_SIZE`
// * `dst` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than
// `USIZE_SIZE`
// * `src` and `dst` do not overlap because we created `usize_buf`
core::ptr::copy_nonoverlapping(self.0.as_ptr(), usize_buf.as_mut_ptr(), SPACE_FOR_CAPACITY);
usize::from_le_bytes(usize_buf)
}

/// Returns whether or not this [`Capacity`] has a value that indicates the
/// capacity is being stored on the heap
#[inline(always)]
pub fn is_heap(&self) -> bool {
self.0 == CAPACITY_IS_ON_THE_HEAP
}
}

#[cfg(test)]
mod tests {
    use rayon::prelude::*;

    use super::Capacity;

    /// Encodes `value` with [`Capacity::new`] and decodes it again with
    /// [`Capacity::as_usize`].
    fn roundtrip(value: usize) -> usize {
        let encoded = Capacity::new(value);
        // SAFETY: `as_usize` only reads the bytes held inside `encoded`
        unsafe { encoded.as_usize() }
    }

    #[test]
    fn test_zero_roundtrips() {
        let original = 0;

        assert_eq!(original, roundtrip(original));
    }

    #[test]
    fn test_max_value() {
        // recompute the maximum independently of `super::MAX_VALUE`
        let usable_bits = (core::mem::size_of::<usize>() - 1) as u32 * 8;
        let max_value = 2usize.pow(usable_bits) - 2;

        #[cfg(target_pointer_width = "64")]
        assert_eq!(max_value, 72057594037927934);
        #[cfg(target_pointer_width = "32")]
        assert_eq!(max_value, 16777214);

        assert_eq!(max_value, roundtrip(max_value));
    }

    #[cfg(target_pointer_width = "32")]
    #[test]
    fn test_invalid_value() {
        let decoded = roundtrip(usize::MAX);

        // anything greater than or equal to 16777215, should "resolve" to 16777215
        assert_eq!(16777215, decoded);
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_all_valid_32bit_values() {
        #[cfg(target_pointer_width = "32")]
        assert_eq!(16_777_214, super::MAX_VALUE);

        // exhaustively check every value that fits inline on a 32-bit target
        (0..=16_777_214_usize).into_par_iter().for_each(|value| {
            assert_eq!(roundtrip(value), value, "value roundtriped to wrong value?");
        });
    }
}
173 changes: 173 additions & 0 deletions crates/hstr/src/repr/heap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
use std::{
mem,
ptr::{self, NonNull},
};

use super::{capacity::Capacity, Repr};

/// A raw pointer to string bytes together with their length.
///
/// `HeapStr::new` fills `ptr` with the address of the first byte of the
/// source text and `len` with the text's byte length encoded as a
/// [`Capacity`].
pub struct HeapStr {
    // address of the first byte of the string data
    ptr: ptr::NonNull<u8>,
    // byte length of the string, stored in the `Capacity` encoding
    len: Capacity,
}

// Compile-time check that `HeapStr` occupies exactly as many bytes as `Repr`.
static_assertions::assert_eq_size!(HeapStr, Repr);

impl HeapStr {
    /// Creates a [`HeapStr`] that refers to the bytes of `text`.
    ///
    /// Nothing is allocated or copied: the address of `text` is stored as-is,
    /// next to its byte length encoded as a [`Capacity`].
    ///
    /// # Safety
    /// * The bytes behind `text` must stay valid for as long as the returned
    ///   value is read through [`HeapStr::as_str`].
    ///
    /// NOTE(review): [`HeapStr::dealloc`] passes the stored pointer on to
    /// `deallocate_ptr`, which releases it through the global allocator, while
    /// this constructor merely borrows `text` -- confirm who owns the memory.
    pub unsafe fn new(text: &str) -> Self {
        Self {
            // a reference is never null, so this conversion needs no check
            ptr: NonNull::from(text).cast::<u8>(),
            len: Capacity::new(text.len()),
        }
    }

    /// Returns the stored length in bytes.
    pub fn len(&self) -> usize {
        // SAFETY: NOTE(review): `as_usize` requires a value `<= MAX_VALUE`.
        // On 32-bit targets `Capacity::new` yields the heap marker for longer
        // strings, which decodes to a different number -- confirm how that
        // case is meant to be handled.
        unsafe { self.len.as_usize() }
    }

    /// Views the referenced bytes as a `&str`.
    pub fn as_str(&self) -> &str {
        let byte_len = self.len();
        let data = self.ptr.as_ptr();
        // SAFETY: `new` took `data` and the length from a `&str`, so the bytes
        // are valid UTF-8, provided the memory is still alive as required by
        // the contract of `new`.
        unsafe {
            let bytes = std::slice::from_raw_parts(data, byte_len);
            std::str::from_utf8_unchecked(bytes)
        }
    }

    /// Releases the buffer by forwarding the pointer and encoded length to
    /// [`deallocate_ptr`].
    ///
    /// NOTE(review): this is a safe method that frees memory; nothing here
    /// prevents it from being called twice or the value from being read
    /// afterwards.
    #[inline]
    pub fn dealloc(&mut self) {
        let (data, encoded_len) = (self.ptr, self.len);
        deallocate_ptr(data, encoded_len)
    }
}

/// Frees the heap buffer at `ptr`, also covering the case where the capacity
/// itself lives on the heap.
///
/// * When `cap` is the heap marker ([`Capacity::is_heap`]), the capacity is
///   read from the `usize` stored directly in front of `ptr`, and the
///   allocation is released starting from that earlier address.
/// * Otherwise `cap` is decoded and the allocation starting at `ptr` is
///   released.
///
/// NOTE(review): this function is safe to call but cannot check that `ptr`
/// really came from the matching `alloc` function with this capacity; callers
/// have to guarantee that.
#[inline]
pub fn deallocate_ptr(ptr: ptr::NonNull<u8>, cap: Capacity) {
    /// Slow path for a capacity stored in front of the string bytes.
    #[cold]
    fn deallocate_with_capacity_on_heap(ptr: ptr::NonNull<u8>) {
        const HEADER_BYTES: usize = mem::size_of::<usize>();

        // step back over the capacity so we point at the start of the
        // allocation
        let base = ptr.as_ptr().wrapping_sub(HEADER_BYTES);
        // SAFETY: the caller established that a `usize` capacity precedes the
        // string bytes, so `base` is valid for reading one `usize`. The read
        // is unaligned because `base` is a byte pointer.
        let capacity = unsafe { base.cast::<usize>().read_unaligned() };
        // SAFETY: `base` was derived from a `NonNull` that points past the
        // header, so it is not null
        let base = unsafe { ptr::NonNull::new_unchecked(base) };
        // SAFETY: `base` now references the capacity header, which is where
        // `heap_capacity::dealloc` expects the allocation to start
        unsafe { heap_capacity::dealloc(base, capacity) }
    }

    if !cap.is_heap() {
        // SAFETY: `is_heap` returned false, so `cap` carries an inline value
        // (always the case on 64-bit archs)
        unsafe { inline_capacity::dealloc(ptr, cap.as_usize()) };
        return;
    }

    deallocate_with_capacity_on_heap(ptr);
}

mod heap_capacity {
    use core::ptr;
    use std::alloc;

    use super::HeapStr;

    /// Allocates a buffer described by [`layout`] for `capacity` string bytes.
    ///
    /// Calls `handle_alloc_error` if the allocator returns a null pointer.
    #[inline]
    pub fn alloc(capacity: usize) -> ptr::NonNull<u8> {
        let buf_layout = layout(capacity);
        debug_assert!(buf_layout.size() > 0);

        // SAFETY: `alloc(...)` has undefined behavior if the layout is
        // zero-sized. That can't happen here because the layout always
        // includes the header, which holds at least one `usize`
        let raw_ptr = unsafe { alloc::alloc(buf_layout) };

        // Some allocators return a null pointer instead of panicking, so turn
        // that into an allocation error
        ptr::NonNull::new(raw_ptr).unwrap_or_else(|| alloc::handle_alloc_error(buf_layout))
    }

    /// Deallocates a pointer which references a `HeapBuffer` whose capacity is
    /// on the heap
    ///
    /// # Safety
    /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is on
    ///   the heap. i.e. we must have `ptr -> [cap<usize> ; string<bytes>]`
    /// * `capacity` must be the value the buffer was allocated with, so that
    ///   the recomputed layout matches
    pub unsafe fn dealloc(ptr: ptr::NonNull<u8>, capacity: usize) {
        let buf_layout = layout(capacity);
        alloc::dealloc(ptr.as_ptr(), buf_layout);
    }

    /// Header placed in front of the string bytes; only used to compute the
    /// allocation layout.
    ///
    /// NOTE(review): `buffer` is a full `HeapStr` (pointer plus `Capacity`),
    /// so this header is larger than a single `usize`, while `deallocate_ptr`
    /// steps back by exactly `size_of::<usize>()` -- confirm the intended
    /// header shape.
    #[repr(C)]
    struct HeapBufferInnerHeapCapacity {
        capacity: usize,
        buffer: HeapStr,
    }

    /// Computes the layout of the header followed by `capacity` bytes, padded
    /// to the alignment of the whole.
    ///
    /// Panics if `capacity` bytes, or the combined layout, cannot be
    /// described by an `alloc::Layout`.
    #[inline(always)]
    pub fn layout(capacity: usize) -> alloc::Layout {
        let header = alloc::Layout::new::<HeapBufferInnerHeapCapacity>();
        let bytes = alloc::Layout::array::<u8>(capacity).expect("valid capacity");
        let (combined, _bytes_offset) = header.extend(bytes).expect("valid layout");
        combined.pad_to_align()
    }
}

mod inline_capacity {
    use core::ptr;
    use std::alloc;

    use super::HeapStr;

    /// Allocates a buffer described by [`layout`] for `capacity` string bytes.
    ///
    /// Calls `handle_alloc_error` if the allocator returns a null pointer.
    ///
    /// # Safety
    /// * `capacity` must be > 0
    #[inline]
    pub unsafe fn alloc(capacity: usize) -> ptr::NonNull<u8> {
        let buf_layout = layout(capacity);
        debug_assert!(buf_layout.size() > 0);

        // SAFETY: `alloc(...)` has undefined behavior if the layout is
        // zero-sized. Requiring `capacity > 0` is the constraint callers
        // uphold for this method; with a non-zero capacity the layout is
        // non-zero-sized as well.
        let raw_ptr = alloc::alloc(buf_layout);

        // Some allocators return a null pointer instead of panicking, so turn
        // that into an allocation error
        ptr::NonNull::new(raw_ptr).unwrap_or_else(|| alloc::handle_alloc_error(buf_layout))
    }

    /// Deallocates a pointer which references a `HeapBuffer` whose capacity is
    /// stored inline
    ///
    /// # Safety
    /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is
    ///   stored inline
    /// * `capacity` must be the value the buffer was allocated with, so that
    ///   the recomputed layout matches
    pub unsafe fn dealloc(ptr: ptr::NonNull<u8>, capacity: usize) {
        let buf_layout = layout(capacity);
        alloc::dealloc(ptr.as_ptr(), buf_layout);
    }

    /// Header type used only to compute the allocation layout.
    ///
    /// NOTE(review): `buffer` is a full `HeapStr` (pointer plus `Capacity`),
    /// so every layout includes those bytes in front of the string data --
    /// confirm that this is intended.
    #[repr(C)]
    struct HeapBufferInnerInlineCapacity {
        buffer: HeapStr,
    }

    /// Computes the layout of the header followed by `capacity` bytes, padded
    /// to the alignment of the whole.
    ///
    /// Panics if `capacity` bytes, or the combined layout, cannot be
    /// described by an `alloc::Layout`.
    #[inline(always)]
    pub fn layout(capacity: usize) -> alloc::Layout {
        let header = alloc::Layout::new::<HeapBufferInnerInlineCapacity>();
        let bytes = alloc::Layout::array::<u8>(capacity).expect("valid capacity");
        let (combined, _bytes_offset) = header.extend(bytes).expect("valid layout");
        combined.pad_to_align()
    }
}
Loading