Skip to content

Commit

Permalink
fix jl_active_task_stack to return correct values
Browse files Browse the repository at this point in the history
Seems potentially slightly costly to need to generate these extra
writes for every safepoint transition to keep track of the current
stack location, but that is potentially an unavoidable cost to doing
conservative stack scanning for gc.
  • Loading branch information
vtjnash authored and fingolfin committed Jan 21, 2025
1 parent be7b44d commit df68965
Show file tree
Hide file tree
Showing 11 changed files with 107 additions and 79 deletions.
1 change: 1 addition & 0 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -3414,6 +3414,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
}
jl_gc_debug_print();

ct->ctx.activefp = (char*)jl_get_frame_addr();
int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
Expand Down
3 changes: 3 additions & 0 deletions src/julia_gcext.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) JL_NOTSAFEPOINT;
// *active_start and *active_end respectively *total_start and *total_end
// accordingly. The range for the active part is a best-effort approximation
// and may not be tight.
//
// NOTE: Only valid to call from within a GC context. May return incorrect
// values and segfault otherwise.
JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task,
char **active_start, char **active_end,
char **total_start, char **total_end) JL_NOTSAFEPOINT;
Expand Down
12 changes: 0 additions & 12 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1765,18 +1765,6 @@ JL_DLLEXPORT unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *fi
void register_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;
void deregister_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;

STATIC_INLINE void *jl_get_frame_addr(void) JL_NOTSAFEPOINT
{
#ifdef __GNUC__
return __builtin_frame_address(0);
#else
void *dummy = NULL;
// The mask is to suppress the compiler warning about returning
// address of local variable
return (void*)((uintptr_t)&dummy & ~(uintptr_t)15);
#endif
}

// Log `msg` to the current logger by calling CoreLogging.logmsg_shim() on the
// julia side. If any of module, group, id, file or line are NULL, these will
// be passed to the julia side as `nothing`. If `kwargs` is NULL an empty set
Expand Down
17 changes: 17 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ typedef struct {
_jl_ucontext_t *ctx;
jl_stack_context_t *copy_ctx;
};
void *activefp;
void *stkbuf; // malloc'd memory (either copybuf or stack)
size_t bufsz; // actual sizeof stkbuf
unsigned int copy_stack:31; // sizeof stack for copybuf
Expand Down Expand Up @@ -226,6 +227,7 @@ typedef jl_value_t jl_function_t;
typedef struct _jl_timing_block_t jl_timing_block_t;
typedef struct _jl_timing_event_t jl_timing_event_t;
typedef struct _jl_excstack_t jl_excstack_t;

typedef struct _jl_handler_t jl_handler_t;

typedef struct _jl_task_t {
Expand Down Expand Up @@ -316,6 +318,18 @@ JL_DLLEXPORT void (jl_cpu_pause)(void);
JL_DLLEXPORT void (jl_cpu_suspend)(void);
JL_DLLEXPORT void (jl_cpu_wake)(void);

STATIC_INLINE void *jl_get_frame_addr(void) JL_NOTSAFEPOINT
{
#ifdef __GNUC__
return __builtin_frame_address(0);
#else
void *dummy = NULL;
// The mask is to suppress the compiler warning about returning
// address of local variable
return (void*)((uintptr_t)&dummy & ~(uintptr_t)15);
#endif
}

#ifdef __clang_gcanalyzer__
// Note that the sigint safepoint can also trigger GC, albeit less likely
void jl_gc_safepoint_(jl_ptls_t tls);
Expand Down Expand Up @@ -343,6 +357,9 @@ STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
{
assert(old_state != JL_GC_PARALLEL_COLLECTOR_THREAD);
assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD);
if (__builtin_constant_p(state) ? state : // required if !old_state && state, otherwise optional
__builtin_constant_p(old_state) ? !old_state : 1)
jl_atomic_load_relaxed(&ptls->current_task)->ctx.activefp = (char*)jl_get_frame_addr();
jl_atomic_store_release(&ptls->gc_state, state);
if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE)
jl_gc_safepoint_(ptls);
Expand Down
29 changes: 18 additions & 11 deletions src/llvm-codegen-shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,16 +237,24 @@ static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Type *T_s
emit_signal_fence(builder);
}

static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state, bool final)
static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, llvm::Value *old_state, bool final)
{
using namespace llvm;
Value *ptls = get_current_ptls_from_task(builder, task, tbaa);
Type *T_int8 = state->getType();
unsigned offset = offsetof(jl_tls_states_t, gc_state);
Value *gc_state = builder.CreateConstInBoundsGEP1_32(T_int8, ptls, offset, "gc_state");
if (old_state == nullptr) {
old_state = builder.CreateLoad(T_int8, gc_state);
cast<LoadInst>(old_state)->setOrdering(AtomicOrdering::Monotonic);
}
if (isa<Constant>(state) ? state == builder.getInt8(0) :
isa<Constant>(old_state) ? old_state == builder.getInt8(0) : true) {
unsigned offset = offsetof(jl_task_t, ctx.activefp);
Value *currentfp = builder.CreateConstInBoundsGEP1_32(T_int8, task, offset, "gc_state");
Value *fp = builder.CreateIntrinsic(Intrinsic::frameaddress, {builder.getPtrTy()}, {builder.getInt32(0)});
builder.CreateAlignedStore(fp, currentfp, Align(sizeof(void*)));
}
builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release);
if (auto *C = dyn_cast<ConstantInt>(old_state))
if (C->isZero())
Expand All @@ -261,39 +269,38 @@ static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::T
builder.CreateICmpEQ(state, zero8)),
passBB, exitBB);
builder.SetInsertPoint(passBB);
MDNode *tbaa = get_tbaa_const(builder.getContext());
emit_gc_safepoint(builder, T_size, ptls, tbaa, final);
emit_gc_safepoint(builder, T_size, ptls, get_tbaa_const(builder.getContext()), final);
builder.CreateBr(exitBB);
builder.SetInsertPoint(exitBB);
return old_state;
}

static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, bool final)
static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, bool final)
{
using namespace llvm;
Value *state = builder.getInt8(0);
return emit_gc_state_set(builder, T_size, ptls, state, nullptr, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, nullptr, final);
}

static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, bool final)
static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, bool final)
{
using namespace llvm;
Value *old_state = builder.getInt8(JL_GC_STATE_UNSAFE);
return emit_gc_state_set(builder, T_size, ptls, state, old_state, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, old_state, final);
}

static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, bool final)
static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, bool final)
{
using namespace llvm;
Value *state = builder.getInt8(JL_GC_STATE_SAFE);
return emit_gc_state_set(builder, T_size, ptls, state, nullptr, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, nullptr, final);
}

static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *ptls, llvm::Value *state, bool final)
static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Type *T_size, llvm::Value *task, llvm::MDNode *tbaa, llvm::Value *state, bool final)
{
using namespace llvm;
Value *old_state = builder.getInt8(JL_GC_STATE_SAFE);
return emit_gc_state_set(builder, T_size, ptls, state, old_state, final);
return emit_gc_state_set(builder, T_size, task, tbaa, state, old_state, final);
}

// Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14.
Expand Down
4 changes: 2 additions & 2 deletions src/llvm-ptls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter,
builder.SetInsertPoint(fastTerm->getParent());
fastTerm->removeFromParent();
MDNode *tbaa = tbaa_gcframe;
Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, pgcstack), tbaa), true);
Value *prior = emit_gc_unsafe_enter(builder, T_size, get_current_task_from_pgcstack(builder, pgcstack), tbaa, true);
builder.Insert(fastTerm);
phi->addIncoming(pgcstack, fastTerm->getParent());
// emit pre-return cleanup
Expand All @@ -197,7 +197,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter,
for (auto &BB : *pgcstack->getParent()->getParent()) {
if (isa<ReturnInst>(BB.getTerminator())) {
builder.SetInsertPoint(BB.getTerminator());
emit_gc_unsafe_leave(builder, T_size, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state, true);
emit_gc_unsafe_leave(builder, T_size, get_current_task_from_pgcstack(builder, phi), tbaa, last_gc_state, true);
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ void jl_set_gc_and_wait(jl_task_t *ct) // n.b. not used on _OS_DARWIN_
// reading own gc state doesn't need atomic ops since no one else
// should store to it.
int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
ct->ctx.activefp = (char*)jl_get_frame_addr();
jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
uv_mutex_lock(&safepoint_lock);
uv_cond_broadcast(&safepoint_cond_begin);
Expand Down Expand Up @@ -279,6 +280,7 @@ void jl_safepoint_wait_thread_resume(jl_task_t *ct)
// might have already observed our gc_state.
// if (!jl_atomic_load_relaxed(&ct->ptls->suspend_count)) return;
int8_t state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
ct->ctx.activefp = (char*)jl_get_frame_addr();
jl_atomic_store_release(&ct->ptls->gc_state, JL_GC_STATE_WAITING);
uv_mutex_lock(&ct->ptls->sleep_lock);
if (jl_atomic_load_relaxed(&ct->ptls->suspend_count)) {
Expand Down
11 changes: 11 additions & 0 deletions src/signals-mach.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,17 @@ void jl_mach_gc_end(void)
// implement jl_set_gc_and_wait from a different thread
static void jl_mach_gc_wait(jl_ptls_t ptls2, mach_port_t thread, int16_t tid)
{
unsigned int count = MACH_THREAD_STATE_COUNT;
host_thread_state_t state;
kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count);
ptls2->current_task->ctx.activefp =
#ifdef _CPU_X86_64_
state->__rsp;
#elif defined(_CPU_AARCH64_)
state->__sp;
#else
#error fp not defined for platform
#endif
// relaxed, since we don't mind missing one--we will hit another soon (immediately probably)
uv_mutex_lock(&safepoint_lock);
// Since this gets set to zero only while the safepoint_lock was held this
Expand Down
63 changes: 25 additions & 38 deletions src/task.c
Original file line number Diff line number Diff line change
Expand Up @@ -356,49 +356,35 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task,
char **active_start, char **active_end,
char **total_start, char **total_end)
{
if (!task->ctx.started) {
*total_start = *active_start = 0;
*total_end = *active_end = 0;
*total_start = *active_start = 0;
*total_end = *active_end = 0;
if (!task->ctx.started)
return;
}

jl_ptls_t ptls2 = task->ptls;
if (task->ctx.copy_stack && ptls2) {
*total_start = *active_start = (char*)ptls2->stackbase - ptls2->stacksize;
*total_end = *active_end = (char*)ptls2->stackbase;
}
else if (task->ctx.stkbuf) {
*total_start = *active_start = (char*)task->ctx.stkbuf;
#ifndef _OS_WINDOWS_
jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0];
if (ptls0->root_task == task) {
// See jl_init_root_task(). The root task of the main thread
// has its buffer enlarged by an artificial 3000000 bytes, but
// that means that the start of the buffer usually points to
// inaccessible memory. We need to correct for this.
*active_start += ROOT_TASK_STACK_ADJUSTMENT;
*total_start += ROOT_TASK_STACK_ADJUSTMENT;
}
#endif

*total_end = *active_end = (char*)task->ctx.stkbuf + task->ctx.bufsz;
#ifdef COPY_STACKS
// save_stack stores the stack of an inactive task in stkbuf, and the
// actual number of used bytes in copy_stack.
if (task->ctx.copy_stack > 1)
if (task->ctx.copy_stack) {
if (task->ctx.ctx) {
// currently paused task
// save_stack stores the stack of an inactive task in stkbuf, and the
// actual number of used bytes in copy_stack.
*active_start = *total_start = (char*)task->ctx.stkbuf;
*active_end = (char*)task->ctx.stkbuf + task->ctx.copy_stack;
#endif
*total_end = (char*)task->ctx.stkbuf + task->ctx.bufsz;
}
else if (task->ptls) {
jl_ptls_t ptls2 = task->ptls;
// currently running copy task
*active_start = (char*)task->ctx.activefp;
*total_start = (char*)ptls2->stackbase - ptls2->stacksize;
*total_end = *active_end = (char*)ptls2->stackbase;
}
}
else {
// no stack allocated yet
*total_start = *active_start = 0;
*total_end = *active_end = 0;
return;
}

if (task == jl_current_task) {
// scan up to current `sp` for current thread and task
*active_start = (char*)jl_get_frame_addr();
if (task->ctx.stkbuf) {
// currently running task
*active_start = (char*)task->ctx.activefp;
*total_start = (char*)task->ctx.stkbuf;
*active_end = *total_end = (char*)task->ctx.stkbuf + task->ctx.bufsz;
}
}
}

Expand Down Expand Up @@ -506,6 +492,7 @@ JL_NO_ASAN static void ctx_switch(jl_task_t *lastt)
}

// set up global state for new task and clear global state for old task
lastt->ctx.activefp = (char*)jl_get_frame_addr();
t->ptls = ptls;
jl_atomic_store_relaxed(&ptls->current_task, t);
JL_GC_PROMISE_ROOTED(t);
Expand Down
5 changes: 3 additions & 2 deletions test/gcext/gcext-test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ end
close(out.in)
close(err.in)
out_task = @async readlines(out)
err_task = @async readlines(err)
err_task = @async read(err, String)
# @test success(p)
errlines = fetch(err_task)
lines = fetch(out_task)
@test length(errlines) == 0
print(errlines)
@test isempty(errlines)
# @test length(lines) == 6
@test length(lines) == 5
@test checknum(lines[2], r"([0-9]+) full collections", n -> n >= 10)
Expand Down
39 changes: 25 additions & 14 deletions test/gcext/gcext.c
Original file line number Diff line number Diff line change
Expand Up @@ -479,8 +479,6 @@ static int stack_grows_down(void) {

void task_scanner(jl_task_t *task, int root_task)
{
int var_on_frame;

// The task scanner is not necessary for liveness, as the
// corresponding task stack is already part of the stack.
// Its purpose is simply to test that the task scanner
Expand All @@ -491,23 +489,36 @@ void task_scanner(jl_task_t *task, int root_task)
char *total_end_stack;
jl_active_task_stack(task, &start_stack, &end_stack, &total_start_stack, &total_end_stack);

if (end_stack == 0)
return;

if (!(start_stack < end_stack) ||
!(total_start_stack < total_end_stack) ||
!(total_start_stack <= start_stack) ||
!(end_stack <= total_end_stack)) {
fputs("incorrect stack bounds\n", stderr);
fflush(stderr);
abort();
}

// this is the live stack of a thread. Is it ours?
if (start_stack && task == (jl_task_t*)jl_get_current_task()) {
if (!(lt_ptr(start_stack, &var_on_frame) && lt_ptr(&var_on_frame, end_stack))) {
if (task == (jl_task_t*)jl_get_current_task()) {
void *var_on_frame = task->ctx.activefp + 1;
if (!(lt_ptr(start_stack, var_on_frame) && lt_ptr(var_on_frame, end_stack))) {
// error, current stack frame must be on the live stack.
jl_error("stack frame not part of the current task");
fputs("stack frame not part of the current task\n", stderr);
fflush(stderr);
abort();
}
}

if (start_stack) {
void **start = (void **)start_stack;
void **end = (void **)end_stack;
while (start < end) {
void *p = *start++;
void *q = jl_gc_internal_obj_base_ptr(p);
if (q) {
jl_gc_mark_queue_obj(ptls, q);
}
void **start = (void **)start_stack;
void **end = (void **)end_stack;
while (start < end) {
void *p = *start++;
void *q = jl_gc_internal_obj_base_ptr(p);
if (q) {
jl_gc_mark_queue_obj(ptls, q);
}
}
}
Expand Down

0 comments on commit df68965

Please sign in to comment.