From 38376b26c7c4e7389f629c2abbabe804668f0d74 Mon Sep 17 00:00:00 2001 From: Caio Date: Wed, 29 Jan 2025 17:38:44 -0300 Subject: [PATCH] [cfg_match] Library edition --- library/core/src/ffi/mod.rs | 116 ++++++++++++++++++ library/core/src/internal_macros.rs | 1 + library/core/src/lib.rs | 1 + library/core/src/num/f32.rs | 40 ++++++ library/core/src/slice/sort/select.rs | 10 ++ library/core/src/slice/sort/stable/mod.rs | 45 +++++++ library/core/src/slice/sort/unstable/mod.rs | 23 ++++ .../core/src/slice/sort/unstable/quicksort.rs | 10 ++ 8 files changed, 246 insertions(+) diff --git a/library/core/src/ffi/mod.rs b/library/core/src/ffi/mod.rs index 51687a3adcdd4..02d32d6490464 100644 --- a/library/core/src/ffi/mod.rs +++ b/library/core/src/ffi/mod.rs @@ -90,6 +90,7 @@ pub type c_ptrdiff_t = isize; pub type c_ssize_t = isize; mod c_char_definition { + #[cfg(bootstrap)] cfg_if! { // These are the targets on which c_char is unsigned. Usually the // signedness is the same for all target_os values on a given architecture @@ -180,9 +181,100 @@ mod c_char_definition { pub(super) type c_char = i8; } } + #[cfg(not(bootstrap))] + crate::cfg_match! { + // These are the targets on which c_char is unsigned. Usually the + // signedness is the same for all target_os values on a given architecture + // but there are some exceptions (see isSignedCharDefault() in clang). + // + // aarch64: + // Section 10 "Arm C and C++ language mappings" in Procedure Call Standard for the Arm® + // 64-bit Architecture (AArch64) says C/C++ char is unsigned byte. + // https://github.com/ARM-software/abi-aa/blob/2024Q3/aapcs64/aapcs64.rst#arm-c-and-c-language-mappings + // arm: + // Section 8 "Arm C and C++ Language Mappings" in Procedure Call Standard for the Arm® + // Architecture says C/C++ char is unsigned byte. 
+ // https://github.com/ARM-software/abi-aa/blob/2024Q3/aapcs32/aapcs32.rst#arm-c-and-c-language-mappings + // csky: + // Section 2.1.2 "Primary Data Type" in C-SKY V2 CPU Applications Binary Interface + // Standards Manual says ANSI C char is unsigned byte. + // https://github.com/c-sky/csky-doc/blob/9f7121f7d40970ba5cc0f15716da033db2bb9d07/C-SKY_V2_CPU_Applications_Binary_Interface_Standards_Manual.pdf + // Note: this doesn't seem to match Clang's default (https://github.com/rust-lang/rust/issues/129945). + // hexagon: + // Section 3.1 "Basic data type" in Qualcomm Hexagon™ Application + // Binary Interface User Guide says "By default, the `char` data type is unsigned." + // https://docs.qualcomm.com/bundle/publicresource/80-N2040-23_REV_K_Qualcomm_Hexagon_Application_Binary_Interface_User_Guide.pdf + // msp430: + // Section 2.1 "Basic Types" in MSP430 Embedded Application Binary + // Interface says "The char type is unsigned by default". + // https://www.ti.com/lit/an/slaa534a/slaa534a.pdf + // Note: this doesn't seem to match Clang's default (https://github.com/rust-lang/rust/issues/129945). + // powerpc/powerpc64: + // - PPC32 SysV: "Table 3-1 Scalar Types" in System V Application Binary Interface PowerPC + // Processor Supplement says ANSI C char is unsigned byte + // https://refspecs.linuxfoundation.org/elf/elfspec_ppc.pdf + // - PPC64 ELFv1: Section 3.1.4 "Fundamental Types" in 64-bit PowerPC ELF Application + // Binary Interface Supplement 1.9 says ANSI C is unsigned byte + // https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi.html#FUND-TYPE + // - PPC64 ELFv2: Section 2.1.2.2 "Fundamental Types" in 64-Bit ELF V2 ABI Specification + // says char is unsigned byte + // https://openpowerfoundation.org/specifications/64bitelfabi/ + // - AIX: XL C for AIX Language Reference says "By default, char behaves like an unsigned char." 
+ // https://www.ibm.com/docs/en/xl-c-aix/13.1.3?topic=specifiers-character-types + // riscv32/riscv64: + // C/C++ type representations section in RISC-V Calling Conventions + // page in RISC-V ELF psABI Document says "char is unsigned." + // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/draft-20240829-13bfa9f54634cb60d86b9b333e109f077805b4b3/riscv-cc.adoc#cc-type-representations + // s390x: + // - ELF: "Table 1.1.: Scalar types" in ELF Application Binary Interface s390x Supplement + // Version 1.6.1 categorize ISO C char in unsigned integer + // https://github.com/IBM/s390x-abi/releases/tag/v1.6.1 + // - z/OS: XL C/C++ Language Reference says: "By default, char behaves like an unsigned char." + // https://www.ibm.com/docs/en/zos/3.1.0?topic=specifiers-character-types + // Xtensa: + // - "The char type is unsigned by default for Xtensa processors." + // + // On the following operating systems, c_char is signed by default, regardless of architecture. + // Darwin (macOS, iOS, etc.): + // Apple targets' c_char is signed by default even on arm + // https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Handle-data-types-and-data-alignment-properly + // Windows: + // Windows MSVC C++ Language Reference says "Microsoft-specific: Variables of type char + // are promoted to int as if from type signed char by default, unless the /J compilation + // option is used." + // https://learn.microsoft.com/en-us/cpp/cpp/fundamental-types-cpp?view=msvc-170#character-types) + // L4RE: + // The kernel builds with -funsigned-char on all targets (but userspace follows the + // architecture defaults). As we only have a target for userspace apps so there are no + // special cases for L4RE below. 
+ all( + not(windows), + not(target_vendor = "apple"), + any( + target_arch = "aarch64", + target_arch = "arm", + target_arch = "csky", + target_arch = "hexagon", + target_arch = "msp430", + target_arch = "powerpc", + target_arch = "powerpc64", + target_arch = "riscv64", + target_arch = "riscv32", + target_arch = "s390x", + target_arch = "xtensa", + ) + ) => { + pub(super) type c_char = u8; + } + _ => { + // On every other target, c_char is signed. + pub(super) type c_char = i8; + } + } } mod c_int_definition { + #[cfg(bootstrap)] cfg_if! { if #[cfg(any(target_arch = "avr", target_arch = "msp430"))] { pub(super) type c_int = i16; @@ -192,9 +284,21 @@ mod c_int_definition { pub(super) type c_uint = u32; } } + #[cfg(not(bootstrap))] + crate::cfg_match! { + any(target_arch = "avr", target_arch = "msp430") => { + pub(super) type c_int = i16; + pub(super) type c_uint = u16; + } + _ => { + pub(super) type c_int = i32; + pub(super) type c_uint = u32; + } + } } mod c_long_definition { + #[cfg(bootstrap)] cfg_if! { if #[cfg(all(target_pointer_width = "64", not(windows)))] { pub(super) type c_long = i64; @@ -205,6 +309,18 @@ mod c_long_definition { pub(super) type c_ulong = u32; } } + #[cfg(not(bootstrap))] + crate::cfg_match! { + all(target_pointer_width = "64", not(windows)) => { + pub(super) type c_long = i64; + pub(super) type c_ulong = u64; + } + _ => { + // The minimal size of `long` in the C standard is 32 bits + pub(super) type c_long = i32; + pub(super) type c_ulong = u32; + } + } } // N.B., for LLVM to recognize the void pointer type and by extension diff --git a/library/core/src/internal_macros.rs b/library/core/src/internal_macros.rs index fe4fa80263c28..fc4b377a2f2f6 100644 --- a/library/core/src/internal_macros.rs +++ b/library/core/src/internal_macros.rs @@ -146,6 +146,7 @@ macro_rules! impl_fn_for_zst { /// ``` // This is a copy of `cfg_if!` from the `cfg_if` crate. // The recursive invocations should use $crate if this is ever exported. 
+#[cfg(bootstrap)] macro_rules! cfg_if { // match if/else chains with a final `else` ( diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index c18e0405f7293..7920e94c844db 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -113,6 +113,7 @@ #![feature(bigint_helper_methods)] #![feature(bstr)] #![feature(bstr_internals)] +#![feature(cfg_match)] #![feature(const_carrying_mul_add)] #![feature(const_eval_select)] #![feature(core_intrinsics)] diff --git a/library/core/src/num/f32.rs b/library/core/src/num/f32.rs index 817bedbd44f98..766346c6b16c0 100644 --- a/library/core/src/num/f32.rs +++ b/library/core/src/num/f32.rs @@ -997,6 +997,7 @@ impl f32 { #[stable(feature = "num_midpoint", since = "1.85.0")] #[rustc_const_stable(feature = "num_midpoint", since = "1.85.0")] pub const fn midpoint(self, other: f32) -> f32 { + #[cfg(bootstrap)] cfg_if! { // Allow faster implementation that have known good 64-bit float // implementations. Falling back to the branchy code on targets that don't @@ -1019,6 +1020,45 @@ impl f32 { let abs_a = a.abs(); let abs_b = b.abs(); + if abs_a <= HI && abs_b <= HI { + // Overflow is impossible + (a + b) / 2. + } else if abs_a < LO { + // Not safe to halve `a` (would underflow) + a + (b / 2.) + } else if abs_b < LO { + // Not safe to halve `b` (would underflow) + (a / 2.) + b + } else { + // Safe to halve `a` and `b` + (a / 2.) + (b / 2.) + } + } + } + #[cfg(not(bootstrap))] + crate::cfg_match! { + // Allow faster implementation that have known good 64-bit float + // implementations. Falling back to the branchy code on targets that don't + // have 64-bit hardware floats or buggy implementations. 
+ // https://github.com/rust-lang/rust/pull/121062#issuecomment-2123408114 + any( + target_arch = "x86_64", + target_arch = "aarch64", + all(any(target_arch = "riscv32", target_arch = "riscv64"), target_feature = "d"), + all(target_arch = "arm", target_feature = "vfp2"), + target_arch = "wasm32", + target_arch = "wasm64", + ) => { + ((self as f64 + other as f64) / 2.0) as f32 + } + _ => { + const LO: f32 = f32::MIN_POSITIVE * 2.; + const HI: f32 = f32::MAX / 2.; + + let (a, b) = (self, other); + let abs_a = a.abs(); + let abs_b = b.abs(); + if abs_a <= HI && abs_b <= HI { // Overflow is impossible (a + b) / 2. diff --git a/library/core/src/slice/sort/select.rs b/library/core/src/slice/sort/select.rs index 3358c03d30a9b..5e6341d0150e5 100644 --- a/library/core/src/slice/sort/select.rs +++ b/library/core/src/slice/sort/select.rs @@ -41,6 +41,7 @@ where let min_idx = min_index(v, &mut is_less).unwrap(); v.swap(min_idx, index); } else { + #[cfg(bootstrap)] cfg_if! { if #[cfg(feature = "optimize_for_size")] { median_of_medians(v, &mut is_less, index); @@ -48,6 +49,15 @@ where partition_at_index_loop(v, index, None, &mut is_less); } } + #[cfg(not(bootstrap))] + crate::cfg_match! { + feature = "optimize_for_size" => { + median_of_medians(v, &mut is_less, index); + } + _ => { + partition_at_index_loop(v, index, None, &mut is_less); + } + } } let (left, right) = v.split_at_mut(index); diff --git a/library/core/src/slice/sort/stable/mod.rs b/library/core/src/slice/sort/stable/mod.rs index 7adcc83b818d1..f6bdb867182b1 100644 --- a/library/core/src/slice/sort/stable/mod.rs +++ b/library/core/src/slice/sort/stable/mod.rs @@ -39,6 +39,7 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less return; } + #[cfg(bootstrap)] cfg_if! 
{ if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { let alloc_len = len / 2; @@ -79,6 +80,50 @@ pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less driftsort_main::(v, is_less); } } + + #[cfg(not(bootstrap))] + crate::cfg_match! { + any(feature = "optimize_for_size", target_pointer_width = "16") => { + let alloc_len = len / 2; + + crate::cfg_match! { + target_pointer_width = "16" => { + let mut heap_buf = BufT::with_capacity(alloc_len); + let scratch = heap_buf.as_uninit_slice_mut(); + } + _ => { + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + } + } + + tiny::mergesort(v, scratch, is_less); + } + _ => { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. 
+ const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); + } + } } /// See [`sort`] diff --git a/library/core/src/slice/sort/unstable/mod.rs b/library/core/src/slice/sort/unstable/mod.rs index 2eb653c4601a7..5684795dadf4d 100644 --- a/library/core/src/slice/sort/unstable/mod.rs +++ b/library/core/src/slice/sort/unstable/mod.rs @@ -30,6 +30,7 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { return; } + #[cfg(bootstrap)] cfg_if! { if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] { heapsort::heapsort(v, is_less); @@ -49,6 +50,28 @@ pub fn sort bool>(v: &mut [T], is_less: &mut F) { ipnsort(v, is_less); } } + + #[cfg(not(bootstrap))] + crate::cfg_match! { + any(feature = "optimize_for_size", target_pointer_width = "16") => { + heapsort::heapsort(v, is_less); + } + _ => { + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. 
+ const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + ipnsort(v, is_less); + } + } } /// See [`sort`] diff --git a/library/core/src/slice/sort/unstable/quicksort.rs b/library/core/src/slice/sort/unstable/quicksort.rs index 4feef5deeb0fb..278d40b2825aa 100644 --- a/library/core/src/slice/sort/unstable/quicksort.rs +++ b/library/core/src/slice/sort/unstable/quicksort.rs @@ -140,6 +140,7 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { // Specialize for types that are relatively cheap to copy, where branchless optimizations // have large leverage e.g. `u64` and `String`. + #[cfg(bootstrap)] cfg_if! { if #[cfg(feature = "optimize_for_size")] { partition_lomuto_branchless_simple:: @@ -147,6 +148,15 @@ const fn inst_partition bool>() -> fn(&mut [T], &T, &mut partition_lomuto_branchless_cyclic:: } } + #[cfg(not(bootstrap))] + crate::cfg_match! { + feature = "optimize_for_size" => { + partition_lomuto_branchless_simple:: + } + _ => { + partition_lomuto_branchless_cyclic:: + } + } } else { partition_hoare_branchy_cyclic:: }