From cf7db3f1bc696629a549bdf77350ce7905da370a Mon Sep 17 00:00:00 2001 From: Ivan Boldyrev Date: Wed, 21 May 2025 18:48:07 +0200 Subject: [PATCH 1/3] aarch64 fibers with a trampoline --- ch05/README.md | 2 + ch05/c-fibers-aarch64/Cargo.toml | 8 + ch05/c-fibers-aarch64/README.md | 32 ++++ ch05/c-fibers-aarch64/src/main.rs | 239 ++++++++++++++++++++++++++++++ 4 files changed, 281 insertions(+) create mode 100644 ch05/c-fibers-aarch64/Cargo.toml create mode 100644 ch05/c-fibers-aarch64/README.md create mode 100644 ch05/c-fibers-aarch64/src/main.rs diff --git a/ch05/README.md b/ch05/README.md index 605465c..c421160 100644 --- a/ch05/README.md +++ b/ch05/README.md @@ -8,3 +8,5 @@ For MacOS users running M-family chips (most newer Macs), there is a few simple steps you can take to get the example running on your machine. It's explained step-by-step in [How-to-MacOS-M.md](./How-to-MacOS-M.md). + +Or you may check the ./c-fibers-aarch64 example. diff --git a/ch05/c-fibers-aarch64/Cargo.toml b/ch05/c-fibers-aarch64/Cargo.toml new file mode 100644 index 0000000..3032db6 --- /dev/null +++ b/ch05/c-fibers-aarch64/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "c-fibers-aarch64" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/ch05/c-fibers-aarch64/README.md b/ch05/c-fibers-aarch64/README.md new file mode 100644 index 0000000..64dfeb2 --- /dev/null +++ b/ch05/c-fibers-aarch64/README.md @@ -0,0 +1,32 @@ +# c-fibers-aarch64 + +This example is an adaptation of `c-fibers` to the AArch64 architecture. + +## Technical requirements + +This example will only work correctly on Unix platforms running on +an AArch64 processor. + +## Running the example + +This example uses the unstable feature "naked_functions" so we need to run it +using nightly Rust. There are two ways to do that. + +1.
Tell cargo to use the nightly toolchain when you run the program: + +``` +cargo +nightly run +``` + +2. Override the default toolchain for this directory: + +``` +rustup override set nightly +cargo run +``` + +## Safety + +The implementation is very unsafe and only focuses on the bare minimum to get a working example running. We focus on explaining the concepts, and the only focus is on explaining them as simply as I can. + +While a fiber implementation like this will never be possible to do fully in safe Rust, there are many ways to make it safer, and it's a good reader's exercise to do so. Just beware that you might have to change the API somewhat since some of the unsafe parts of this example are there just to make the API very easy to understand for learning purposes. diff --git a/ch05/c-fibers-aarch64/src/main.rs b/ch05/c-fibers-aarch64/src/main.rs new file mode 100644 index 0000000..8d54a07 --- /dev/null +++ b/ch05/c-fibers-aarch64/src/main.rs @@ -0,0 +1,239 @@ +#![feature(naked_functions)] +use std::arch::asm; + +const DEFAULT_STACK_SIZE: usize = 1024 * 1024 * 2; +const MAX_THREADS: usize = 4; +static mut RUNTIME: usize = 0; + +pub struct Runtime { + threads: Vec, + current: usize, +} + +#[derive(PartialEq, Eq, Debug)] +enum State { + Available, + Running, + Ready, +} + +struct Thread { + stack: Vec, + ctx: ThreadContext, + state: State, +} + +#[derive(Debug, Default)] +#[repr(C)] +struct ThreadContext { + // fp is stored on stack before the switch call + sp: u64, // 00 + lr: u64, // 08 + r19: u64, // 10 + r20: u64, // 18 + r21: u64, // 20 + r22: u64, // 28 + r23: u64, // 30 + r24: u64, // 38 + r25: u64, // 40 + r26: u64, // 48 + r27: u64, // 50 + r28: u64, // 58 +} + +impl Thread { + fn new() -> Self { + Thread { + stack: vec![0_u8; DEFAULT_STACK_SIZE], + ctx: ThreadContext::default(), + state: State::Available, + } + } +} + +impl Runtime { + pub fn new() -> Self { + let base_thread = Thread { + stack: vec![0_u8; DEFAULT_STACK_SIZE], + ctx:
ThreadContext::default(), + state: State::Running, + }; + + let mut threads = Vec::with_capacity(MAX_THREADS); + threads.push(base_thread); + let available_threads = (1..MAX_THREADS).map(|_| Thread::new()); + threads.extend(available_threads); + + Runtime { + threads, + current: 0, + } + } + + pub unsafe fn init(&self) { + let r_ptr: *const Runtime = self; + RUNTIME = r_ptr as usize; + } + + pub fn run(&mut self) -> ! { + while self.t_yield() {} + std::process::exit(0); + } + + fn t_return(&mut self) { + if self.current != 0 { + self.threads[self.current].state = State::Available; + self.t_yield(); + } + } + + #[inline(never)] + fn t_yield(&mut self) -> bool { + let mut pos = self.current; + while self.threads[pos].state != State::Ready { + pos += 1; + if pos == self.threads.len() { + pos = 0; + } + if pos == self.current { + return false; + } + } + + if self.threads[self.current].state != State::Available { + self.threads[self.current].state = State::Ready; + } + + self.threads[pos].state = State::Running; + let old_pos = self.current; + self.current = pos; + + unsafe { + let old: *mut ThreadContext = &mut self.threads[old_pos].ctx; + let new: *const ThreadContext = &self.threads[pos].ctx; + asm!( + "bl _switch", + in("x0") old, + in("x1") new, + clobber_abi("C")); + } + self.threads.len() > 0 + } + + pub fn spawn(&mut self, f: fn()) { + let available = self + .threads + .iter_mut() + .find(|t| t.state == State::Available) + .expect("no available thread."); + + let size = available.stack.len(); + + unsafe { + let s_ptr = available.stack.as_mut_ptr().offset(size as isize); + let s_ptr = (s_ptr as usize & !15) as *mut u8; + std::ptr::write(s_ptr.offset(-32) as *mut u64, f as u64); + std::ptr::write(s_ptr.offset(-16) as *mut u64, guard as u64); + + available.ctx.lr = trampoline as u64; + available.ctx.sp = s_ptr.offset(-32) as u64; + } + available.state = State::Ready; + } +} // We close the `impl Runtime` block here + +fn guard() { + unsafe { + let rt_ptr = RUNTIME 
as *mut Runtime; + (*rt_ptr).t_return(); + }; +} + +pub fn yield_thread() { + unsafe { + let rt_ptr = RUNTIME as *mut Runtime; + (*rt_ptr).t_yield(); + }; +} + +#[naked] +#[no_mangle] +unsafe extern "C" fn trampoline() { + // sp + 00 function_address + // sp + 08 reserved + // sp + 10 guard address + // sp + 18 .. unused + asm!( + "ldr lr, [sp, 0x10]", + "ldr x1, [sp, 0x00]", + "br x1", + options(noreturn) + ) +} + +#[naked] +#[no_mangle] +unsafe extern "C" fn switch() { + asm!( + "str lr, [x0, 0x08]", + "str x19, [x0, 0x10]", + "str x20, [x0, 0x18]", + "str x21, [x0, 0x20]", + "str x22, [x0, 0x28]", + "str x23, [x0, 0x30]", + "str x24, [x0, 0x38]", + "str x25, [x0, 0x40]", + "str x26, [x0, 0x48]", + "str x27, [x0, 0x50]", + "str x28, [x0, 0x58]", + // sp cannot be stored/loaded directly -- use an intermediate register, one of the stored/loaded + "mov x19, sp", + "str x19, [x0, 0x00]", + "ldr x19, [x1, 0x00]", + "mov sp, x19", + "ldr lr, [x1, 0x08]", + "ldr x19, [x1, 0x10]", + "ldr x20, [x1, 0x18]", + "ldr x21, [x1, 0x20]", + "ldr x22, [x1, 0x28]", + "ldr x23, [x1, 0x30]", + "ldr x24, [x1, 0x38]", + "ldr x25, [x1, 0x40]", + "ldr x26, [x1, 0x48]", + "ldr x27, [x1, 0x50]", + "ldr x28, [x1, 0x58]", + "br lr", + options(noreturn) + ); +} + +fn main() { + let mut runtime = Runtime::new(); + // safety: we use single instance of new, use spawn and run correctly + // TODO make runtime thread-local? + // TODO pass runtime as an arg to spawn lambda? 
+ unsafe { + runtime.init(); + } + + runtime.spawn(|| { + println!("THREAD 1 STARTING"); + let id = 1; + for i in 0..10 { + println!("thread: {} counter: {}", id, i); + yield_thread(); + } + println!("THREAD 1 FINISHED"); + }); + + runtime.spawn(|| { + println!("THREAD 2 STARTING"); + let id = 2; + for i in 0..15 { + println!("thread: {} counter: {}", id, i); + yield_thread(); + } + println!("THREAD 2 FINISHED"); + }); + runtime.run(); +} From 7ac6d7f4be5de848f1632646ffeed788663ba587 Mon Sep 17 00:00:00 2001 From: Ivan Boldyrev Date: Thu, 22 May 2025 12:55:29 +0200 Subject: [PATCH 2/3] Fix and update for current nightly Rust 1. fix storing `r29` aka FP; 2. use `std::arch::naked_asm` where appropriate; 3. use `#[unsafe(naked)]`. --- ch05/c-fibers-aarch64/src/main.rs | 144 +++++++++++++++++------------- 1 file changed, 82 insertions(+), 62 deletions(-) diff --git a/ch05/c-fibers-aarch64/src/main.rs b/ch05/c-fibers-aarch64/src/main.rs index 8d54a07..b7e1a4c 100644 --- a/ch05/c-fibers-aarch64/src/main.rs +++ b/ch05/c-fibers-aarch64/src/main.rs @@ -1,8 +1,12 @@ #![feature(naked_functions)] use std::arch::asm; +use std::arch::naked_asm; const DEFAULT_STACK_SIZE: usize = 1024 * 1024 * 2; const MAX_THREADS: usize = 4; +const F_TRAMPOLINE_OFFSET: usize = 0; +const GUARD_TRAMPOLINE_OFFSET: usize = F_TRAMPOLINE_OFFSET + std::mem::size_of::(); + static mut RUNTIME: usize = 0; pub struct Runtime { @@ -26,19 +30,19 @@ struct Thread { #[derive(Debug, Default)] #[repr(C)] struct ThreadContext { - // fp is stored on stack before the switch call - sp: u64, // 00 - lr: u64, // 08 - r19: u64, // 10 - r20: u64, // 18 - r21: u64, // 20 - r22: u64, // 28 - r23: u64, // 30 - r24: u64, // 38 - r25: u64, // 40 - r26: u64, // 48 - r27: u64, // 50 - r28: u64, // 58 + r19: u64, // 00 + r20: u64, // 08 + r21: u64, // 10 + r22: u64, // 18 + r23: u64, // 20 + r24: u64, // 28 + r25: u64, // 30 + r26: u64, // 38 + r27: u64, // 40 + r28: u64, // 48 + fp: u64, // 50 + lr: u64, // 58 + sp: u64, //
60 } impl Thread { @@ -117,7 +121,7 @@ impl Runtime { in("x1") new, clobber_abi("C")); } - self.threads.len() > 0 + !self.threads.is_empty() } pub fn spawn(&mut self, f: fn()) { @@ -129,15 +133,21 @@ impl Runtime { let size = available.stack.len(); + // prepare stack for the trampoline + let stack_top; unsafe { - let s_ptr = available.stack.as_mut_ptr().offset(size as isize); - let s_ptr = (s_ptr as usize & !15) as *mut u8; - std::ptr::write(s_ptr.offset(-32) as *mut u64, f as u64); - std::ptr::write(s_ptr.offset(-16) as *mut u64, guard as u64); - - available.ctx.lr = trampoline as u64; - available.ctx.sp = s_ptr.offset(-32) as u64; + let s_end = available.stack.as_mut_ptr().add(size); + let s_end_aligned = (s_end as usize & !15) as *mut u8; + stack_top = s_end_aligned.offset(-32); + std::ptr::write(stack_top.add(F_TRAMPOLINE_OFFSET).cast::(), f as u64); + std::ptr::write( + stack_top.add(GUARD_TRAMPOLINE_OFFSET).cast::(), + guard as u64, + ); } + available.ctx.lr = trampoline as u64; + available.ctx.fp = 0; + available.ctx.sp = stack_top as u64; available.state = State::Ready; } } // We close the `impl Runtime` block here @@ -156,55 +166,65 @@ pub fn yield_thread() { }; } -#[naked] +#[unsafe(naked)] #[no_mangle] unsafe extern "C" fn trampoline() { + // the stack is prepared by the `Runtime::spawn`: // sp + 00 function_address - // sp + 08 reserved - // sp + 10 guard address - // sp + 18 .. unused - asm!( - "ldr lr, [sp, 0x10]", - "ldr x1, [sp, 0x00]", + // sp + 08 guard address + // sp + 10 .. unused + naked_asm! 
{ + "ldr x1, [sp, {f_trampoline_offset}]", + "ldr lr, [sp, {guard_trampoline_offset}]", + "sub sp, sp, 0x10", // current stack frame is not needed anymore (NOTE(review): `sub` lowers sp and keeps the frame allocated; releasing it would be `add sp, sp, 0x10` -- confirm intent) "br x1", - options(noreturn) - ) + f_trampoline_offset = const F_TRAMPOLINE_OFFSET, + guard_trampoline_offset = const GUARD_TRAMPOLINE_OFFSET, + }; } -#[naked] +#[unsafe(naked)] #[no_mangle] unsafe extern "C" fn switch() { - asm!( - "str lr, [x0, 0x08]", - "str x19, [x0, 0x10]", - "str x20, [x0, 0x18]", - "str x21, [x0, 0x20]", - "str x22, [x0, 0x28]", - "str x23, [x0, 0x30]", - "str x24, [x0, 0x38]", - "str x25, [x0, 0x40]", - "str x26, [x0, 0x48]", - "str x27, [x0, 0x50]", - "str x28, [x0, 0x58]", - // sp cannot be stored/loaded directly -- use an intermediate register, one of the stored/loaded - "mov x19, sp", - "str x19, [x0, 0x00]", - "ldr x19, [x1, 0x00]", - "mov sp, x19", - "ldr lr, [x1, 0x08]", - "ldr x19, [x1, 0x10]", - "ldr x20, [x1, 0x18]", - "ldr x21, [x1, 0x20]", - "ldr x22, [x1, 0x28]", - "ldr x23, [x1, 0x30]", - "ldr x24, [x1, 0x38]", - "ldr x25, [x1, 0x40]", - "ldr x26, [x1, 0x48]", - "ldr x27, [x1, 0x50]", - "ldr x28, [x1, 0x58]", - "br lr", - options(noreturn) - ); + naked_asm! { + // saving the old fiber. + // TODO we might use `stp` instruction to store a pair of registers at once, but we don't. + // TODO we might also use a postincrement instead of precomputed offsets. + // But this is not an ARM tutorial, really. + "str x19, [x0, 0x00]", + "str x20, [x0, 0x08]", + "str x21, [x0, 0x10]", + "str x22, [x0, 0x18]", + "str x23, [x0, 0x20]", + "str x24, [x0, 0x28]", + "str x25, [x0, 0x30]", + "str x26, [x0, 0x38]", + "str x27, [x0, 0x40]", + "str x28, [x0, 0x48]", + "str fp, [x0, 0x50]", + "str lr, [x0, 0x58]", + // sp cannot be stored/loaded directly -- go through x2, a scratch register that is not part of the saved context. + "mov x2, sp", + "str x2, [x0, 0x60]", + + // loading the new fiber + // TODO we might use `ldp` instruction to load a pair of registers at once, but we don't.
+ "ldr x19, [x1, 0x00]", + "ldr x20, [x1, 0x08]", + "ldr x21, [x1, 0x10]", + "ldr x22, [x1, 0x18]", + "ldr x23, [x1, 0x20]", + "ldr x24, [x1, 0x28]", + "ldr x25, [x1, 0x30]", + "ldr x26, [x1, 0x38]", + "ldr x27, [x1, 0x40]", + "ldr x28, [x1, 0x48]", + "ldr fp, [x1, 0x50]", + "ldr lr, [x1, 0x58]", + "ldr x2, [x1, 0x60]", + "mov sp, x2", + "ret", + }; } fn main() { From a1503b464510a7ed5918b1f5b95824ded813421d Mon Sep 17 00:00:00 2001 From: Ivan Boldyrev Date: Tue, 27 May 2025 13:41:19 +0200 Subject: [PATCH 3/3] Reduce initial frame size 16 bytes is enough. --- ch05/c-fibers-aarch64/src/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ch05/c-fibers-aarch64/src/main.rs b/ch05/c-fibers-aarch64/src/main.rs index b7e1a4c..3b7a8c8 100644 --- a/ch05/c-fibers-aarch64/src/main.rs +++ b/ch05/c-fibers-aarch64/src/main.rs @@ -138,7 +138,7 @@ impl Runtime { unsafe { let s_end = available.stack.as_mut_ptr().add(size); let s_end_aligned = (s_end as usize & !15) as *mut u8; - stack_top = s_end_aligned.offset(-32); + stack_top = s_end_aligned.offset(-16); std::ptr::write(stack_top.add(F_TRAMPOLINE_OFFSET).cast::(), f as u64); std::ptr::write( stack_top.add(GUARD_TRAMPOLINE_OFFSET).cast::(), @@ -172,7 +172,6 @@ unsafe extern "C" fn trampoline() { // the stack is prepared by the `Runtime::spawn`: // sp + 00 function_address // sp + 08 guard address - // sp + 10 .. unused naked_asm! { "ldr x1, [sp, {f_trampoline_offset}]", "ldr lr, [sp, {guard_trampoline_offset}]",