From a111bff3bf7cd327f958036f469c836bfd77508f Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 8 Jan 2025 14:53:02 -0800 Subject: [PATCH 01/19] wip: gc mark alive for nogil --- Include/internal/pycore_gc.h | 50 ++++- Python/gc_free_threading.c | 408 ++++++++++++++++++++++++++++++++++- 2 files changed, 441 insertions(+), 17 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 4ff34bf8ead7d0..289b4453bc41e3 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -45,12 +45,13 @@ static inline PyObject* _Py_FROM_GC(PyGC_Head *gc) { * the per-object lock. */ #ifdef Py_GIL_DISABLED -# define _PyGC_BITS_TRACKED (1) // Tracked by the GC -# define _PyGC_BITS_FINALIZED (2) // tp_finalize was called -# define _PyGC_BITS_UNREACHABLE (4) -# define _PyGC_BITS_FROZEN (8) -# define _PyGC_BITS_SHARED (16) -# define _PyGC_BITS_DEFERRED (64) // Use deferred reference counting +# define _PyGC_BITS_TRACKED (1<<0) // Tracked by the GC +# define _PyGC_BITS_FINALIZED (1<<1) // tp_finalize was called +# define _PyGC_BITS_UNREACHABLE (1<<2) +# define _PyGC_BITS_FROZEN (1<<3) +# define _PyGC_BITS_SHARED (1<<4) +# define _PyGC_BITS_ALIVE (1<<5) // Reachable from a known root. +# define _PyGC_BITS_DEFERRED (1<<7) // Use deferred reference counting #endif #ifdef Py_GIL_DISABLED @@ -289,6 +290,38 @@ enum _GCPhase { GC_PHASE_COLLECT = 1 }; +// if true, enable GC timing statistics +#define WITH_GC_TIMING_STATS 0 + +#if WITH_GC_TIMING_STATS + +#define QUANTILE_COUNT 5 +#define MARKER_COUNT (QUANTILE_COUNT * 3 + 2) + +typedef struct { + double q[MARKER_COUNT]; + double dn[MARKER_COUNT]; + double np[MARKER_COUNT]; + int n[MARKER_COUNT]; + int count; + double max; +} p2_engine; + +struct gc_timing_state { + /* timing statistics computed by P^2 algorithm */ + p2_engine auto_all; // timing for all automatic collections + p2_engine auto_full; // timing for full (gen2) automatic collections + /* Total time spent inside cyclic GC */ + PyTime_t gc_total_time; + /* Time spent inside incremental mark part of cyclic GC */ + PyTime_t gc_mark_time; + /* Maximum GC pause time */ + PyTime_t gc_max_pause; + /* Total number of times GC was run */ + PyTime_t gc_runs; +}; +#endif // WITH_GC_TIMING_STATS + struct _gc_runtime_state { /* List of objects that still need to be cleaned up, singly linked * via their gc headers' gc_prev pointers. */ @@ -318,6 +351,11 @@ struct _gc_runtime_state { int visited_space; int phase; +#if WITH_GC_TIMING_STATS + /* state for GC timing statistics */ + struct gc_timing_state timing_state; +#endif + #ifdef Py_GIL_DISABLED /* This is the number of objects that survived the last full collection. It approximates the number of long lived objects diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index f7f44407494e51..aa115e869e05d0 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -7,6 +7,7 @@ #include "pycore_freelist.h" // _PyObject_ClearFreeLists() #include "pycore_initconfig.h" #include "pycore_interp.h" // PyInterpreterState.gc +#include "pycore_time.h" #include "pycore_object.h" #include "pycore_object_alloc.h" // _PyObject_MallocWithType() #include "pycore_object_stack.h" @@ -33,6 +34,156 @@ typedef struct _gc_runtime_state GCState; // Automatically choose the generation that needs collecting. 
#define GENERATION_AUTO (-1) +#if WITH_GC_TIMING_STATS + +static void p2engine_init(p2_engine* engine, const double quantiles[QUANTILE_COUNT]) { + engine->count = 0; + engine->max = 0.0; + + engine->dn[0] = 0; + engine->dn[MARKER_COUNT - 1] = 1; + + int pointer = 1; + for (int i = 0; i < QUANTILE_COUNT; i++) { + double quantile = quantiles[i]; + engine->dn[pointer] = quantile; + engine->dn[pointer + 1] = quantile / 2; + engine->dn[pointer + 2] = (1 + quantile) / 2; + pointer += 3; + } + + // Sort the markers + for (int i = 0; i < MARKER_COUNT; i++) { + for (int j = i + 1; j < MARKER_COUNT; j++) { + if (engine->dn[i] > engine->dn[j]) { + double temp = engine->dn[i]; + engine->dn[i] = engine->dn[j]; + engine->dn[j] = temp; + } + } + } + + for (int i = 0; i < MARKER_COUNT; i++) { + engine->np[i] = (MARKER_COUNT - 1) * engine->dn[i] + 1; + } +} + +static int sign(double d) { + return (d >= 0) ? 1 : -1; +} + +static double parabolic(p2_engine* engine, int i, int d) { + return engine->q[i] + d / ((engine->n[i + 1] - engine->n[i - 1]) * + ((engine->n[i] - engine->n[i - 1] + d) * ((engine->q[i + 1] - engine->q[i]) / (engine->n[i + 1] - engine->n[i])) + + (engine->n[i + 1] - (engine->n[i] - d)) * ((engine->q[i] - engine->q[i - 1]) / (engine->n[i] - engine->n[i - 1])))); +} + +static double linear(p2_engine* engine, int i, int d) { + return engine->q[i] + d * ((engine->q[i + d] - engine->q[i]) / (engine->n[i + d] - engine->n[i])); +} + +static void p2engine_add(p2_engine* engine, double data) { + int i, k = 0; + if (data > engine->max) { + engine->max = data; + } + if (engine->count >= MARKER_COUNT) { + engine->count++; + // B1 + if (data < engine->q[0]) { + engine->q[0] = data; + k = 1; + } else if (data >= engine->q[MARKER_COUNT - 1]) { + engine->q[MARKER_COUNT - 1] = data; + k = MARKER_COUNT - 1; + } else { + for (i = 1; i < MARKER_COUNT; i++) { + if (data < engine->q[i]) { + k = i; + break; + } + } + } + + // B2 + for (i = k; i < MARKER_COUNT; i++) { + engine->n[i]++; + engine->np[i] = engine->np[i] + engine->dn[i]; + } + + for (i = 0; i < k; i++) { + engine->np[i] = engine->np[i] + engine->dn[i]; + } + + // B3 + for (i = 1; i < MARKER_COUNT - 1; i++) { + double d = engine->np[i] - engine->n[i]; + if ((d >= 1 && engine->n[i + 1] - engine->n[i] > 1) || + (d <= -1 && engine->n[i - 1] - engine->n[i] < -1)) { + double newq = parabolic(engine, i, sign(d)); + if (engine->q[i - 1] < newq && newq < engine->q[i + 1]) { + engine->q[i] = newq; + } else { + engine->q[i] = linear(engine, i, sign(d)); + } + engine->n[i] = engine->n[i] + sign(d); + } + } + } else { + engine->q[engine->count] = data; + engine->count++; + if (engine->count == MARKER_COUNT) { + // We have enough to start the algorithm, initialize + for (i = 0; i < MARKER_COUNT; i++) { + for (int j = i + 1; j < MARKER_COUNT; j++) { + if (engine->q[i] > engine->q[j]) { + double temp = engine->q[i]; + engine->q[i] = engine->q[j]; + engine->q[j] = temp; + } + } + } + for (i = 0; i < MARKER_COUNT; i++) { + engine->n[i] = i + 1; + } + } + } +} + +static double p2engine_result(p2_engine* engine, double quantile) { + if (engine->count < MARKER_COUNT) { + int closest = 1; + for (int i = 0; i < engine->count; i++) { + for (int j = i + 1; j < engine->count; j++) { + if (engine->q[i] > engine->q[j]) { + double temp = engine->q[i]; + engine->q[i] = engine->q[j]; + engine->q[j] = temp; + } + } + } + for (int i = 2; i < engine->count; i++) { + if (fabs((double)i / engine->count - quantile) < + fabs((double)closest / MARKER_COUNT - quantile)) { + closest = i; + 
} + } + return engine->q[closest]; + } else { + int closest = 1; + for (int i = 2; i < MARKER_COUNT - 1; i++) { + if (fabs(engine->dn[i] - quantile) < fabs(engine->dn[closest] - quantile)) { + closest = i; + } + } + return engine->q[closest]; + } +} + +static double gc_timing_quantiles[QUANTILE_COUNT] = {0.5, 0.75, 0.9, 0.95, 0.99}; + +#endif // WITH_GC_TIMING_STATS + // A linked list of objects using the `ob_tid` field as the next pointer. // The linked list pointers are distinct from any real thread ids, because the // thread ids returned by _Py_ThreadId() are also pointers to distinct objects. @@ -113,28 +264,64 @@ worklist_remove(struct worklist_iter *iter) iter->next = iter->ptr; } +static inline int +gc_has_bit(PyObject *op, uint8_t bit) +{ + return (op->ob_gc_bits & bit) != 0; +} + +static inline void +gc_set_bit(PyObject *op, uint8_t bit) +{ + op->ob_gc_bits |= bit; +} + +static inline void +gc_clear_bit(PyObject *op, uint8_t bit) +{ + op->ob_gc_bits &= ~bit; +} + static inline int gc_is_frozen(PyObject *op) { - return (op->ob_gc_bits & _PyGC_BITS_FROZEN) != 0; + return gc_has_bit(op, _PyGC_BITS_FROZEN); } static inline int gc_is_unreachable(PyObject *op) { - return (op->ob_gc_bits & _PyGC_BITS_UNREACHABLE) != 0; + return gc_has_bit(op, _PyGC_BITS_UNREACHABLE); } static void gc_set_unreachable(PyObject *op) { - op->ob_gc_bits |= _PyGC_BITS_UNREACHABLE; + gc_set_bit(op, _PyGC_BITS_UNREACHABLE); } static void gc_clear_unreachable(PyObject *op) { - op->ob_gc_bits &= ~_PyGC_BITS_UNREACHABLE; + gc_clear_bit(op, _PyGC_BITS_UNREACHABLE); +} + +static inline int +gc_is_alive(PyObject *op) +{ + return gc_has_bit(op, _PyGC_BITS_ALIVE); +} + +static void +gc_set_alive(PyObject *op) +{ + gc_set_bit(op, _PyGC_BITS_ALIVE); +} + +static void +gc_clear_alive(PyObject *op) +{ + gc_clear_bit(op, _PyGC_BITS_ALIVE); } // Initialize the `ob_tid` field to zero if the object is not already @@ -143,6 +330,7 @@ static void gc_maybe_init_refs(PyObject *op) { if (!gc_is_unreachable(op)) { + assert(!gc_is_alive(op)); gc_set_unreachable(op); op->ob_tid = 0; } @@ -264,9 +452,13 @@ static void gc_restore_refs(PyObject *op) { if (gc_is_unreachable(op)) { + assert(!gc_is_alive(op)); gc_restore_tid(op); gc_clear_unreachable(op); } + else { + gc_clear_alive(op); + } } // Given a mimalloc memory block return the PyObject stored in it or NULL if @@ -460,7 +652,8 @@ visit_decref(PyObject *op, void *arg) { if (_PyObject_GC_IS_TRACKED(op) && !_Py_IsImmortal(op) - && !gc_is_frozen(op)) + && !gc_is_frozen(op) + && !gc_is_alive(op)) { // If update_refs hasn't reached this object yet, mark it // as (tentatively) unreachable and initialize ob_tid to zero. @@ -470,6 +663,11 @@ visit_decref(PyObject *op, void *arg) return 0; } +unsigned int num_alive; +unsigned int num_immortal; +unsigned int num_checked; +unsigned int num_gc; + // Compute the number of external references to objects in the heap // by subtracting internal references from the refcount. The difference is // computed in the ob_tid field (we restore it later). 
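The refcount-difference trick that update_refs() and visit_decref() implement can be seen in isolation with a small, self-contained model. The sketch below is illustrative only; the names (toy_obj, toy_update_refs) are invented for the example and are not CPython APIs. Each refcount is copied into a scratch field (the role ob_tid plays during the pass), then one is subtracted for every reference that originates inside the scanned set; whatever remains is the number of references coming from outside the set, and objects left with a positive count are the externally reachable roots.

#include <stddef.h>
#include <stdio.h>

/* Toy model of the "subtract internal references" pass. */
typedef struct toy_obj {
    int refcount;             /* true reference count */
    int scratch;              /* plays the role of ob_tid during the pass */
    struct toy_obj *refs[4];  /* outgoing references inside the scanned set */
    int nrefs;
} toy_obj;

static void
toy_update_refs(toy_obj **objs, int n)
{
    for (int i = 0; i < n; i++) {
        objs[i]->scratch = objs[i]->refcount;   /* seed scratch with the true refcount */
    }
    for (int i = 0; i < n; i++) {               /* subtract references internal to the set */
        for (int j = 0; j < objs[i]->nrefs; j++) {
            objs[i]->refs[j]->scratch--;
        }
    }
}

int
main(void)
{
    /* a <-> b form a cycle; c is also referenced once from "outside". */
    toy_obj a = {1, 0, {NULL}, 0}, b = {1, 0, {NULL}, 0}, c = {2, 0, {NULL}, 0};
    a.refs[a.nrefs++] = &b;
    b.refs[b.nrefs++] = &a;
    b.refs[b.nrefs++] = &c;
    toy_obj *objs[] = {&a, &b, &c};
    toy_update_refs(objs, 3);
    for (int i = 0; i < 3; i++) {
        printf("obj %d: external refs = %d\n", i, objs[i]->scratch);
    }
    /* Prints 0, 0, 1: the a<->b cycle has no external references, while c
     * stays reachable through the reference from outside the set. */
    return 0;
}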
@@ -482,6 +680,20 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area, return true; } + bool is_alive = gc_is_alive(op); + num_gc++; + if (is_alive) { + num_alive++; + } + if (_Py_IsImmortal(op)) { + num_immortal++; + } + if (is_alive) { + return true; + } + + num_checked++; + // Exclude immortal objects from garbage collection if (_Py_IsImmortal(op)) { op->ob_tid = 0; @@ -553,6 +765,21 @@ mark_reachable(PyObject *op) } #ifdef GC_DEBUG +static bool +validate_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area, + void *block, size_t block_size, void *args) +{ + PyObject *op = op_from_block(block, args, false); + if (op == NULL) { + return true; + } + + _PyObject_ASSERT_WITH_MSG(op, !gc_is_alive(op), + "object should not be marked as alive yet"); + + return true; +} + static bool validate_refcounts(const mi_heap_t *heap, const mi_heap_area_t *area, void *block, size_t block_size, void *args) @@ -586,6 +813,11 @@ validate_gc_objects(const mi_heap_t *heap, const mi_heap_area_t *area, return true; } + if (gc_is_alive(op)) { + _PyObject_ASSERT(op, !gc_is_unreachable(op)); + return true; + } + _PyObject_ASSERT(op, gc_is_unreachable(op)); _PyObject_ASSERT_WITH_MSG(op, gc_get_refs(op) >= 0, "refcount is too small"); @@ -602,10 +834,14 @@ mark_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, return true; } + if (gc_is_alive(op)) { + return true; + } + _PyObject_ASSERT_WITH_MSG(op, gc_get_refs(op) >= 0, "refcount is too small"); - if (gc_is_unreachable(op) && gc_get_refs(op) != 0) { + if (gc_get_refs(op) != 0) { // Object is reachable but currently marked as unreachable. // Mark it as reachable and traverse its pointers to find // any other object that may be directly reachable from it. @@ -630,6 +866,7 @@ restore_refs(const mi_heap_t *heap, const mi_heap_area_t *area, } gc_restore_tid(op); gc_clear_unreachable(op); + gc_clear_alive(op); return true; } @@ -679,6 +916,7 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, // object is reachable, restore `ob_tid`; we're done with these objects gc_restore_tid(op); + gc_clear_alive(op); state->long_lived_total++; return true; } @@ -686,6 +924,75 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, static int move_legacy_finalizer_reachable(struct collection_state *state); +#if WITH_GC_TIMING_STATS +static void +print_gc_times(GCState *gcstate) +{ + fprintf(stderr, "gc times: runs %ld total %.3fs mark %.3fs max %ldus avg %ldus\n", + gcstate->timing_state.gc_runs, + PyTime_AsSecondsDouble(gcstate->timing_state.gc_total_time), + PyTime_AsSecondsDouble(gcstate->timing_state.gc_mark_time), + (long)_PyTime_AsMicroseconds(gcstate->timing_state.gc_max_pause, _PyTime_ROUND_HALF_EVEN), + (long)_PyTime_AsMicroseconds(gcstate->timing_state.gc_total_time / gcstate->timing_state.gc_runs, _PyTime_ROUND_HALF_EVEN) + ); +} +#endif // WITH_GC_TIMING_STATS + +static int +visit_propagate_alive(PyObject *op, _PyObjectStack *stack) +{ + if (_PyObject_GC_IS_TRACKED(op) && !gc_is_alive(op)) { + gc_set_alive(op); + return _PyObjectStack_Push(stack, op); + } + return 0; +} + +static int +propagate_alive_bits(PyObject *op) +{ + _PyObjectStack stack = { NULL }; + do { + assert(_PyObject_GC_IS_TRACKED(op)); + assert(gc_is_alive(op)); + traverseproc traverse = Py_TYPE(op)->tp_traverse; + if (traverse(op, (visitproc)&visit_propagate_alive, &stack) < 0) { + _PyObjectStack_Clear(&stack); + return -1; + } + op = _PyObjectStack_Pop(&stack); + } while (op != NULL); + return 0; +} + +static int 
+mark_root_reachable(PyInterpreterState *interp, + struct collection_state *state) +{ +#if WITH_GC_TIMING_STATS + PyTime_t t1; + (void)PyTime_PerfCounterRaw(&t1); +#endif + +#ifdef GC_DEBUG + // Check that all objects don't have ALIVE bits set + gc_visit_heaps(interp, &validate_alive_bits, &state->base); +#endif + PyObject *root = interp->sysdict; + assert(!gc_is_alive(root)); + gc_set_alive(root); + propagate_alive_bits(root); + +#if WITH_GC_TIMING_STATS + PyTime_t t2; + (void)PyTime_PerfCounterRaw(&t2); + interp->gc.timing_state.gc_mark_time += t2 - t1; +#endif + + return 0; +} + + static int deduce_unreachable_heap(PyInterpreterState *interp, struct collection_state *state) @@ -697,11 +1004,19 @@ deduce_unreachable_heap(PyInterpreterState *interp, gc_visit_heaps(interp, &validate_refcounts, &state->base); #endif + num_alive = 0; + num_gc = 0; + num_checked = 0; + // Identify objects that are directly reachable from outside the GC heap // by computing the difference between the refcount and the number of // incoming references. gc_visit_heaps(interp, &update_refs, &state->base); + #if WITH_GC_TIMING_STATS + //fprintf(stderr, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); + #endif + #ifdef GC_DEBUG // Check that all objects are marked as unreachable and that the computed // reference count difference (stored in `ob_tid`) is non-negative. @@ -870,6 +1185,11 @@ _PyGC_Init(PyInterpreterState *interp) return _PyStatus_NO_MEMORY(); } +#if WITH_GC_TIMING_STATS + //p2engine_init(&gcstate->timing_state.auto_all, gc_timing_quantiles); + p2engine_init(&gcstate->timing_state.auto_full, gc_timing_quantiles); +#endif + return _PyStatus_OK(); } @@ -1175,11 +1495,15 @@ gc_should_collect(GCState *gcstate) if (count <= threshold || threshold == 0 || !gcstate->enabled) { return false; } + #if 1 // disable to force more frequent collections // Avoid quadratic behavior by scaling threshold to the number of live // objects. A few tests rely on immediate scheduling of the GC so we ignore // the scaled threshold if generations[1].threshold is set to zero. - return (count > gcstate->long_lived_total / 4 || - gcstate->old[0].threshold == 0); + if (count < gcstate->long_lived_total / 4 && gcstate->old[0].threshold != 0) { + return false; + } + #endif + return true; } static void @@ -1217,6 +1541,8 @@ record_deallocation(PyThreadState *tstate) } } +static bool freeze_used; + static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, int generation) { @@ -1245,6 +1571,19 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, process_delayed_frees(interp, state); + #if 1 + if (!freeze_used) { + // Mark objects reachable from known roots as "alive". These will + // be ignored for rest of the GC pass. + int err = mark_root_reachable(interp, state); + if (err < 0) { + _PyEval_StartTheWorld(interp); + PyErr_NoMemory(); + return; + } + } + #endif + // Find unreachable objects int err = deduce_unreachable_heap(interp, state); if (err < 0) { @@ -1253,6 +1592,11 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, return; } +#ifdef GC_DEBUG + // At this point, no object should have ALIVE bit set + gc_visit_heaps(interp, &validate_alive_bits, &state->base); +#endif + // Print debugging information. 
if (interp->gc.debug & _PyGC_DEBUG_COLLECTABLE) { PyObject *op; @@ -1341,6 +1685,11 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) invoke_gc_callback(tstate, "start", generation, 0, 0); } +#if WITH_GC_TIMING_STATS + PyTime_t gc_timing_t1; + (void)PyTime_PerfCounterRaw(&gc_timing_t1); +#endif + if (gcstate->debug & _PyGC_DEBUG_STATS) { PySys_WriteStderr("gc: collecting generation %d...\n", generation); show_stats_each_generations(gcstate); @@ -1370,9 +1719,30 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) (void)PyTime_PerfCounterRaw(&t2); double d = PyTime_AsSecondsDouble(t2 - t1); PySys_WriteStderr( - "gc: done, %zd unreachable, %zd uncollectable, %.4fs elapsed\n", - n+m, n, d); + "gc: done, %zd unreachable, %zd uncollectable, %zd long-lived, %.4fs elapsed\n", + n+m, n, state.long_lived_total, d); + } + +#if WITH_GC_TIMING_STATS + PyTime_t gc_timing_t2, dt; + (void)PyTime_PerfCounterRaw(&gc_timing_t2); + dt = gc_timing_t2 - gc_timing_t1; + if (reason == _Py_GC_REASON_HEAP) { + #if 0 // no generations, always full collection + p2engine_add(&gcstate->timing_state.auto_all, + (double)_PyTime_AsMicroseconds(dt, + _PyTime_ROUND_HALF_EVEN)); + #endif + p2engine_add(&gcstate->timing_state.auto_full, + (double)_PyTime_AsMicroseconds(dt, + _PyTime_ROUND_HALF_EVEN)); + if (dt > gcstate->timing_state.gc_max_pause) { + gcstate->timing_state.gc_max_pause = dt; + } } + gcstate->timing_state.gc_total_time += dt; + gcstate->timing_state.gc_runs++; +#endif // WITH_GC_TIMING_STATS // Clear the current thread's free-list again. _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; @@ -1564,6 +1934,7 @@ _PyGC_Freeze(PyInterpreterState *interp) { struct visitor_args args; _PyEval_StopTheWorld(interp); + freeze_used = true; gc_visit_heaps(interp, &visit_freeze, &args); _PyEval_StartTheWorld(interp); } @@ -1574,7 +1945,7 @@ visit_unfreeze(const mi_heap_t *heap, const mi_heap_area_t *area, { PyObject *op = op_from_block(block, args, true); if (op != NULL) { - op->ob_gc_bits &= ~_PyGC_BITS_FROZEN; + gc_clear_bit(op, _PyGC_BITS_FROZEN); } return true; } @@ -1585,6 +1956,7 @@ _PyGC_Unfreeze(PyInterpreterState *interp) struct visitor_args args; _PyEval_StopTheWorld(interp); gc_visit_heaps(interp, &visit_unfreeze, &args); + freeze_used = false; _PyEval_StartTheWorld(interp); } @@ -1727,6 +2099,20 @@ _PyGC_Fini(PyInterpreterState *interp) Py_CLEAR(gcstate->garbage); Py_CLEAR(gcstate->callbacks); + #if WITH_GC_TIMING_STATS + print_gc_times(gcstate); + #if 0 // no generations so all are full collections + for (int i = 0; i < QUANTILE_COUNT; i++) { + double result = p2engine_result(&gcstate->timing_state.auto_all, gc_timing_quantiles[i]); + fprintf(stderr, "gc timing all Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); + } + #endif + for (int i = 0; i < QUANTILE_COUNT; i++) { + double result = p2engine_result(&gcstate->timing_state.auto_full, gc_timing_quantiles[i]); + fprintf(stderr, "gc timing full Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); + } + #endif // WITH_GC_TIMING_STATS + /* We expect that none of this interpreters objects are shared with other interpreters. See https://github.com/python/cpython/issues/90228. 
*/ From 200b69b12591326646a7e6d85e2296f267e73982 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 02/19] wip: mark more roots --- Python/gc_free_threading.c | 50 +++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index aa115e869e05d0..9fe6c34e3c120e 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -949,22 +949,38 @@ visit_propagate_alive(PyObject *op, _PyObjectStack *stack) } static int -propagate_alive_bits(PyObject *op) +propagate_alive_bits(_PyObjectStack *stack) { - _PyObjectStack stack = { NULL }; - do { + for (;;) { + PyObject *op = _PyObjectStack_Pop(stack); + if (op == NULL) { + break; + } assert(_PyObject_GC_IS_TRACKED(op)); assert(gc_is_alive(op)); traverseproc traverse = Py_TYPE(op)->tp_traverse; - if (traverse(op, (visitproc)&visit_propagate_alive, &stack) < 0) { - _PyObjectStack_Clear(&stack); + if (traverse(op, (visitproc)&visit_propagate_alive, stack) < 0) { + _PyObjectStack_Clear(stack); return -1; } - op = _PyObjectStack_Pop(&stack); - } while (op != NULL); + } return 0; } +static bool +mark_stack_push(_PyObjectStack *stack, PyObject *op) +{ + if (op == NULL) { + return true; + } + assert(!gc_is_alive(op)); + gc_set_alive(op); + if (_PyObjectStack_Push(stack, Py_NewRef(op)) < 0) { + return false; + } + return true; +} + static int mark_root_reachable(PyInterpreterState *interp, struct collection_state *state) @@ -978,10 +994,20 @@ mark_root_reachable(PyInterpreterState *interp, // Check that all objects don't have ALIVE bits set gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif - PyObject *root = interp->sysdict; - assert(!gc_is_alive(root)); - gc_set_alive(root); - propagate_alive_bits(root); + _PyObjectStack stack = { NULL }; + mark_stack_push(&stack, interp->sysdict); + mark_stack_push(&stack, interp->builtins); + mark_stack_push(&stack, interp->dict); + struct types_state *types = &interp->types; + for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) { + mark_stack_push(&stack, types->builtins.initialized[i].tp_dict); + mark_stack_push(&stack, types->builtins.initialized[i].tp_subclasses); + } + for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) { + mark_stack_push(&stack, types->for_extensions.initialized[i].tp_dict); + mark_stack_push(&stack, types->for_extensions.initialized[i].tp_subclasses); + } + propagate_alive_bits(&stack); #if WITH_GC_TIMING_STATS PyTime_t t2; @@ -1014,7 +1040,7 @@ deduce_unreachable_heap(PyInterpreterState *interp, gc_visit_heaps(interp, &update_refs, &state->base); #if WITH_GC_TIMING_STATS - //fprintf(stderr, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); + fprintf(stderr, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); #endif #ifdef GC_DEBUG From 713689ab789678ec3cf5a5e7df71f046f384aa43 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 03/19] wip: log gc timing to temp file --- Python/gc_free_threading.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 9fe6c34e3c120e..400df1851c9126 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -925,10 +925,11 @@ static int move_legacy_finalizer_reachable(struct collection_state *state); #if WITH_GC_TIMING_STATS +FILE *gc_log; 
static void print_gc_times(GCState *gcstate) { - fprintf(stderr, "gc times: runs %ld total %.3fs mark %.3fs max %ldus avg %ldus\n", + fprintf(gc_log, "gc times: runs %ld total %.3fs mark %.3fs max %ldus avg %ldus\n", gcstate->timing_state.gc_runs, PyTime_AsSecondsDouble(gcstate->timing_state.gc_total_time), PyTime_AsSecondsDouble(gcstate->timing_state.gc_mark_time), @@ -975,7 +976,8 @@ mark_stack_push(_PyObjectStack *stack, PyObject *op) } assert(!gc_is_alive(op)); gc_set_alive(op); - if (_PyObjectStack_Push(stack, Py_NewRef(op)) < 0) { + //gc_maybe_merge_refcount(op); + if (_PyObjectStack_Push(stack, op) < 0) { return false; } return true; @@ -1040,7 +1042,7 @@ deduce_unreachable_heap(PyInterpreterState *interp, gc_visit_heaps(interp, &update_refs, &state->base); #if WITH_GC_TIMING_STATS - fprintf(stderr, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); + fprintf(gc_log, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); #endif #ifdef GC_DEBUG @@ -1212,6 +1214,7 @@ _PyGC_Init(PyInterpreterState *interp) } #if WITH_GC_TIMING_STATS + gc_log = fopen("/tmp/gc_timing.log", "a"); //p2engine_init(&gcstate->timing_state.auto_all, gc_timing_quantiles); p2engine_init(&gcstate->timing_state.auto_full, gc_timing_quantiles); #endif @@ -2130,13 +2133,14 @@ _PyGC_Fini(PyInterpreterState *interp) #if 0 // no generations so all are full collections for (int i = 0; i < QUANTILE_COUNT; i++) { double result = p2engine_result(&gcstate->timing_state.auto_all, gc_timing_quantiles[i]); - fprintf(stderr, "gc timing all Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); + fprintf(gc_log, "gc timing all Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); } #endif for (int i = 0; i < QUANTILE_COUNT; i++) { double result = p2engine_result(&gcstate->timing_state.auto_full, gc_timing_quantiles[i]); - fprintf(stderr, "gc timing full Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); + fprintf(gc_log, "gc timing full Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); } + fclose(gc_log); #endif // WITH_GC_TIMING_STATS /* We expect that none of this interpreters objects are shared From 2635fe0b5b59c393dd3c76c554db4a13955321e5 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 04/19] wip: mark stack refs as alive --- Python/gc_free_threading.c | 66 +++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 400df1851c9126..4cf75b52117b83 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -584,6 +584,56 @@ gc_visit_thread_stacks(PyInterpreterState *interp) _Py_FOR_EACH_TSTATE_END(interp); } +static bool +mark_stack_push(_PyObjectStack *stack, PyObject *op) +{ + if (op == NULL) { + return true; + } + assert(!gc_is_alive(op)); + gc_set_alive(op); + //gc_maybe_merge_refcount(op); + if (_PyObjectStack_Push(stack, op) < 0) { + return false; + } + return true; +} + +static inline void +gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) +{ + // Note: we MUST check that it is deferred before checking the rest. + // Otherwise we might read into invalid memory due to non-deferred references + // being dead already. 
+ if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) { + PyObject *obj = PyStackRef_AsPyObjectBorrow(stackref); + if (_PyObject_GC_IS_TRACKED(obj) && !gc_is_frozen(obj)) { + mark_stack_push(stack, obj); + } + } +} + +static void +gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack) +{ + _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { + for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) { + PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable); + if (executable == NULL || !PyCode_Check(executable)) { + continue; + } + + PyCodeObject *co = (PyCodeObject *)executable; + int max_stack = co->co_nlocalsplus + co->co_stacksize; + gc_visit_stackref(f->f_executable); + for (int i = 0; i < max_stack; i++) { + gc_visit_stackref_mark_alive(stack, f->localsplus[i]); + } + } + } + _Py_FOR_EACH_TSTATE_END(interp); +} + static void queue_untracked_obj_decref(PyObject *op, struct collection_state *state) { @@ -968,21 +1018,6 @@ propagate_alive_bits(_PyObjectStack *stack) return 0; } -static bool -mark_stack_push(_PyObjectStack *stack, PyObject *op) -{ - if (op == NULL) { - return true; - } - assert(!gc_is_alive(op)); - gc_set_alive(op); - //gc_maybe_merge_refcount(op); - if (_PyObjectStack_Push(stack, op) < 0) { - return false; - } - return true; -} - static int mark_root_reachable(PyInterpreterState *interp, struct collection_state *state) @@ -997,6 +1032,7 @@ mark_root_reachable(PyInterpreterState *interp, gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif _PyObjectStack stack = { NULL }; + gc_visit_thread_stacks_mark_alive(interp, &stack); mark_stack_push(&stack, interp->sysdict); mark_stack_push(&stack, interp->builtins); mark_stack_push(&stack, interp->dict); From 72128e2bf79682489a3abfc9cad792b4ee13f056 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 05/19] wip: include # of collected in debug log --- Python/gc_free_threading.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 4cf75b52117b83..af764be3110c22 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1077,10 +1077,6 @@ deduce_unreachable_heap(PyInterpreterState *interp, // incoming references. gc_visit_heaps(interp, &update_refs, &state->base); - #if WITH_GC_TIMING_STATS - fprintf(gc_log, "gc alive %d immortal %d checked %d gc %d\n", num_alive, num_immortal, num_checked, num_gc); - #endif - #ifdef GC_DEBUG // Check that all objects are marked as unreachable and that the computed // reference count difference (stored in `ob_tid`) is non-negative. 
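propagate_alive_bits() replaces recursion with an explicit worklist: an object is marked alive before it is pushed, so it is traversed at most once, and tp_traverse supplies its outgoing edges when it is popped. A minimal stand-alone sketch of that pattern follows; the names (node, mark_alive_from) are invented for the example and are not CPython APIs, and a fixed-size array stands in for _PyObjectStack.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy object graph: `alive` plays the role of _PyGC_BITS_ALIVE and `out`
 * plays the role of the edges reported by tp_traverse. */
typedef struct node {
    bool alive;
    struct node *out[4];
    int nout;
} node;

/* Iterative mark from a single root using an explicit stack. */
static void
mark_alive_from(node *root, node **stack, int cap)
{
    int top = 0;
    if (root == NULL || root->alive) {
        return;
    }
    root->alive = true;           /* mark before pushing: visit each node once */
    stack[top++] = root;
    while (top > 0) {
        node *n = stack[--top];
        for (int i = 0; i < n->nout; i++) {
            node *m = n->out[i];
            if (m != NULL && !m->alive && top < cap) {
                m->alive = true;
                stack[top++] = m;
            }
        }
    }
}

int
main(void)
{
    node a = {0}, b = {0}, c = {0}, d = {0};
    a.out[a.nout++] = &b;
    b.out[b.nout++] = &c;
    c.out[c.nout++] = &a;         /* a -> b -> c -> a is a cycle, but reachable */
    node *stack[16];
    mark_alive_from(&a, stack, 16);
    printf("a=%d b=%d c=%d d=%d\n", a.alive, b.alive, c.alive, d.alive);
    /* Prints a=1 b=1 c=1 d=0: everything reachable from the root is marked
     * alive; d is never marked and is left for the normal unreachable-object
     * passes to examine. */
    return 0;
}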
@@ -1837,6 +1833,11 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) } #endif + #if WITH_GC_TIMING_STATS + fprintf(gc_log, "gc alive %d collected %ld checked %d gc %d\n", num_alive, m, num_checked, num_gc); + fflush(gc_log); + #endif + if (PyDTrace_GC_DONE_ENABLED()) { PyDTrace_GC_DONE(n + m); } From 942f77d6805a43e841a1951b551c324cf99ecb25 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 06/19] wip: move freeze_used to gc state --- Include/internal/pycore_gc.h | 2 ++ Python/gc_free_threading.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 289b4453bc41e3..bc19cfda89b1e1 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -351,6 +351,8 @@ struct _gc_runtime_state { int visited_space; int phase; + int freeze_used; + #if WITH_GC_TIMING_STATS /* state for GC timing statistics */ struct gc_timing_state timing_state; diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index af764be3110c22..276cffed2ab156 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1602,8 +1602,6 @@ record_deallocation(PyThreadState *tstate) } } -static bool freeze_used; - static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, int generation) { @@ -1633,7 +1631,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, process_delayed_frees(interp, state); #if 1 - if (!freeze_used) { + if (!state->gcstate->freeze_used) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass. int err = mark_root_reachable(interp, state); @@ -2000,7 +1998,8 @@ _PyGC_Freeze(PyInterpreterState *interp) { struct visitor_args args; _PyEval_StopTheWorld(interp); - freeze_used = true; + GCState *gcstate = get_gc_state(); + gcstate->freeze_used = 1; gc_visit_heaps(interp, &visit_freeze, &args); _PyEval_StartTheWorld(interp); } @@ -2021,8 +2020,9 @@ _PyGC_Unfreeze(PyInterpreterState *interp) { struct visitor_args args; _PyEval_StopTheWorld(interp); + GCState *gcstate = get_gc_state(); + gcstate->freeze_used = 0; gc_visit_heaps(interp, &visit_unfreeze, &args); - freeze_used = false; _PyEval_StartTheWorld(interp); } From a3089e5d8fe84ad1d8e57c0d8e5844d97d8a6d31 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 07/19] wip: fix OOM error handling, add ifdef toggles --- Include/internal/pycore_gc.h | 6 +-- Python/gc_free_threading.c | 91 ++++++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index bc19cfda89b1e1..8ffb7f5b0a5c8d 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -291,9 +291,9 @@ enum _GCPhase { }; // if true, enable GC timing statistics -#define WITH_GC_TIMING_STATS 0 +//#define WITH_GC_TIMING_STATS 1 -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS #define QUANTILE_COUNT 5 #define MARKER_COUNT (QUANTILE_COUNT * 3 + 2) @@ -353,7 +353,7 @@ struct _gc_runtime_state { int freeze_used; -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS /* state for GC timing statistics */ struct gc_timing_state timing_state; #endif diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 276cffed2ab156..e9a771fef8372f 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c 
@@ -18,6 +18,17 @@ #include "pydtrace.h" #include "pycore_uniqueid.h" // _PyObject_MergeThreadLocalRefcounts() + +// enable the "mark alive" pass of GC +#define GC_ENABLE_MARK_ALIVE 1 + +// include addtional roots in "mark alive" pass +//#define GC_MARK_ALIVE_EXTRA_ROOTS 1 + +// include Python stacks as set of known roots +//#define GC_MARK_ALIVE_STACKS 1 + + #ifdef Py_GIL_DISABLED typedef struct _gc_runtime_state GCState; @@ -34,7 +45,7 @@ typedef struct _gc_runtime_state GCState; // Automatically choose the generation that needs collecting. #define GENERATION_AUTO (-1) -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS static void p2engine_init(p2_engine* engine, const double quantiles[QUANTILE_COUNT]) { engine->count = 0; @@ -312,11 +323,13 @@ gc_is_alive(PyObject *op) return gc_has_bit(op, _PyGC_BITS_ALIVE); } +#ifdef GC_ENABLE_MARK_ALIVE static void gc_set_alive(PyObject *op) { gc_set_bit(op, _PyGC_BITS_ALIVE); } +#endif static void gc_clear_alive(PyObject *op) @@ -584,8 +597,9 @@ gc_visit_thread_stacks(PyInterpreterState *interp) _Py_FOR_EACH_TSTATE_END(interp); } +#ifdef GC_ENABLE_MARK_ALIVE static bool -mark_stack_push(_PyObjectStack *stack, PyObject *op) +mark_alive_stack_push(_PyObjectStack *stack, PyObject *op) { if (op == NULL) { return true; @@ -599,7 +613,9 @@ mark_stack_push(_PyObjectStack *stack, PyObject *op) return true; } -static inline void + +#ifdef GC_MARK_ALIVE_STACKS +static bool gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) { // Note: we MUST check that it is deferred before checking the rest. @@ -608,12 +624,15 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) { PyObject *obj = PyStackRef_AsPyObjectBorrow(stackref); if (_PyObject_GC_IS_TRACKED(obj) && !gc_is_frozen(obj)) { - mark_stack_push(stack, obj); + if (!mark_alive_stack_push(stack, obj)) { + return false; + } } } + return true; } -static void +static bool gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack) { _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { @@ -625,14 +644,21 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st PyCodeObject *co = (PyCodeObject *)executable; int max_stack = co->co_nlocalsplus + co->co_stacksize; - gc_visit_stackref(f->f_executable); + if (!gc_visit_stackref_mark_alive(stack, f->f_executable)) { + return false; + } for (int i = 0; i < max_stack; i++) { - gc_visit_stackref_mark_alive(stack, f->localsplus[i]); + if (!gc_visit_stackref_mark_alive(stack, f->localsplus[i])) { + return false; + } } } } _Py_FOR_EACH_TSTATE_END(interp); + return true; } +#endif // GC_MARK_ALIVE_STACKS +#endif // GC_ENABLE_MARK_ALIVE static void queue_untracked_obj_decref(PyObject *op, struct collection_state *state) @@ -974,7 +1000,7 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, static int move_legacy_finalizer_reachable(struct collection_state *state); -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS FILE *gc_log; static void print_gc_times(GCState *gcstate) @@ -989,6 +1015,7 @@ print_gc_times(GCState *gcstate) } #endif // WITH_GC_TIMING_STATS +#ifdef GC_ENABLE_MARK_ALIVE static int visit_propagate_alive(PyObject *op, _PyObjectStack *stack) { @@ -1022,7 +1049,7 @@ static int mark_root_reachable(PyInterpreterState *interp, struct collection_state *state) { -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS PyTime_t t1; (void)PyTime_PerfCounterRaw(&t1); #endif @@ -1032,22 +1059,37 @@ 
mark_root_reachable(PyInterpreterState *interp, gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif _PyObjectStack stack = { NULL }; - gc_visit_thread_stacks_mark_alive(interp, &stack); - mark_stack_push(&stack, interp->sysdict); - mark_stack_push(&stack, interp->builtins); - mark_stack_push(&stack, interp->dict); + + #define STACK_PUSH(op) \ + if (!mark_alive_stack_push(&stack, op)) { \ + _PyObjectStack_Clear(&stack); \ + return -1; \ + } + STACK_PUSH(interp->sysdict); +#ifdef GC_MARK_ALIVE_EXTRA_ROOTS + STACK_PUSH(interp->builtins); + STACK_PUSH(interp->dict); struct types_state *types = &interp->types; for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) { - mark_stack_push(&stack, types->builtins.initialized[i].tp_dict); - mark_stack_push(&stack, types->builtins.initialized[i].tp_subclasses); + STACK_PUSH(types->builtins.initialized[i].tp_dict); + STACK_PUSH(types->builtins.initialized[i].tp_subclasses); } for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) { - mark_stack_push(&stack, types->for_extensions.initialized[i].tp_dict); - mark_stack_push(&stack, types->for_extensions.initialized[i].tp_subclasses); + STACK_PUSH(types->for_extensions.initialized[i].tp_dict); + STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses); + } +#endif +#ifdef GC_MARK_ALIVE_STACKS + if (!gc_visit_thread_stacks_mark_alive(interp, &stack)) { + _PyObjectStack_Clear(&stack); + return -1; } +#endif + #undef STACK_PUSH + propagate_alive_bits(&stack); -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS PyTime_t t2; (void)PyTime_PerfCounterRaw(&t2); interp->gc.timing_state.gc_mark_time += t2 - t1; @@ -1055,6 +1097,7 @@ mark_root_reachable(PyInterpreterState *interp, return 0; } +#endif // GC_ENABLE_MARK_ALIVE static int @@ -1245,7 +1288,7 @@ _PyGC_Init(PyInterpreterState *interp) return _PyStatus_NO_MEMORY(); } -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS gc_log = fopen("/tmp/gc_timing.log", "a"); //p2engine_init(&gcstate->timing_state.auto_all, gc_timing_quantiles); p2engine_init(&gcstate->timing_state.auto_full, gc_timing_quantiles); @@ -1630,7 +1673,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, process_delayed_frees(interp, state); - #if 1 + #ifdef GC_ENABLE_MARK_ALIVE if (!state->gcstate->freeze_used) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass. 
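Taken together with the earlier hunks, the gate around the new pass at this point in the series reads roughly as below (a condensed view for reading convenience, not literal patch text). The pass runs only when GC_ENABLE_MARK_ALIVE is compiled in and gc.freeze() has not been used -- the patch's own rationale being that pre-marking is unlikely to help much once the heap is frozen -- and on out-of-memory it must restart the world before reporting the error, since the collector has already stopped all other threads by the time it reaches this code.

#ifdef GC_ENABLE_MARK_ALIVE
    if (!state->gcstate->freeze_used) {
        // Mark objects reachable from known roots as "alive"; the rest of
        // the collection ignores objects with the alive bit set.
        int err = mark_root_reachable(interp, state);
        if (err < 0) {
            _PyEval_StartTheWorld(interp);   // world was stopped earlier
            PyErr_NoMemory();
            return;
        }
    }
#endif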
@@ -1744,7 +1787,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) invoke_gc_callback(tstate, "start", generation, 0, 0); } -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS PyTime_t gc_timing_t1; (void)PyTime_PerfCounterRaw(&gc_timing_t1); #endif @@ -1782,7 +1825,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) n+m, n, state.long_lived_total, d); } -#if WITH_GC_TIMING_STATS +#ifdef WITH_GC_TIMING_STATS PyTime_t gc_timing_t2, dt; (void)PyTime_PerfCounterRaw(&gc_timing_t2); dt = gc_timing_t2 - gc_timing_t1; @@ -1831,7 +1874,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) } #endif - #if WITH_GC_TIMING_STATS + #ifdef WITH_GC_TIMING_STATS fprintf(gc_log, "gc alive %d collected %ld checked %d gc %d\n", num_alive, m, num_checked, num_gc); fflush(gc_log); #endif @@ -2165,7 +2208,7 @@ _PyGC_Fini(PyInterpreterState *interp) Py_CLEAR(gcstate->garbage); Py_CLEAR(gcstate->callbacks); - #if WITH_GC_TIMING_STATS + #ifdef WITH_GC_TIMING_STATS print_gc_times(gcstate); #if 0 // no generations so all are full collections for (int i = 0; i < QUANTILE_COUNT; i++) { From 4f99de6dfc3266e7375113f3661c1b17e619b7f8 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 08/19] wip: fix reversion on mark_heap_visitor() If the object is already marked as reachable, we shouldn't traverse it again. --- Python/gc_free_threading.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index e9a771fef8372f..f84c1dbdf53ee6 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -917,7 +917,7 @@ mark_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, _PyObject_ASSERT_WITH_MSG(op, gc_get_refs(op) >= 0, "refcount is too small"); - if (gc_get_refs(op) != 0) { + if (gc_is_unreachable(op) && gc_get_refs(op) != 0) { // Object is reachable but currently marked as unreachable. // Mark it as reachable and traverse its pointers to find // any other object that may be directly reachable from it. From 4f6e5390eaf3ff0739eb5dd5e9ab380cb24c041d Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 09/19] wip: revert changes to gc_should_collect() These are not strictly needed, simplify PR. --- Python/gc_free_threading.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index f84c1dbdf53ee6..ce24a03de46fb2 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1599,15 +1599,11 @@ gc_should_collect(GCState *gcstate) if (count <= threshold || threshold == 0 || !gcstate->enabled) { return false; } - #if 1 // disable to force more frequent collections // Avoid quadratic behavior by scaling threshold to the number of live // objects. A few tests rely on immediate scheduling of the GC so we ignore // the scaled threshold if generations[1].threshold is set to zero. 
- if (count < gcstate->long_lived_total / 4 && gcstate->old[0].threshold != 0) { - return false; - } - #endif - return true; + return (count > gcstate->long_lived_total / 4 || + gcstate->old[0].threshold == 0); } static void From b99f2dfd368e4972d0529b971d6cdc029d0f7133 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 10/19] wip: removing timing code --- Include/internal/pycore_gc.h | 42 +----- Python/gc_free_threading.c | 255 +---------------------------------- 2 files changed, 7 insertions(+), 290 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 8ffb7f5b0a5c8d..66396545ef2f52 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -290,38 +290,6 @@ enum _GCPhase { GC_PHASE_COLLECT = 1 }; -// if true, enable GC timing statistics -//#define WITH_GC_TIMING_STATS 1 - -#ifdef WITH_GC_TIMING_STATS - -#define QUANTILE_COUNT 5 -#define MARKER_COUNT (QUANTILE_COUNT * 3 + 2) - -typedef struct { - double q[MARKER_COUNT]; - double dn[MARKER_COUNT]; - double np[MARKER_COUNT]; - int n[MARKER_COUNT]; - int count; - double max; -} p2_engine; - -struct gc_timing_state { - /* timing statistics computed by P^2 algorithm */ - p2_engine auto_all; // timing for all automatic collections - p2_engine auto_full; // timing for full (gen2) automatic collections - /* Total time spent inside cyclic GC */ - PyTime_t gc_total_time; - /* Time spent inside incremental mark part of cyclic GC */ - PyTime_t gc_mark_time; - /* Maximum GC pause time */ - PyTime_t gc_max_pause; - /* Total number of times GC was run */ - PyTime_t gc_runs; -}; -#endif // WITH_GC_TIMING_STATS - struct _gc_runtime_state { /* List of objects that still need to be cleaned up, singly linked * via their gc headers' gc_prev pointers. */ @@ -351,13 +319,6 @@ struct _gc_runtime_state { int visited_space; int phase; - int freeze_used; - -#ifdef WITH_GC_TIMING_STATS - /* state for GC timing statistics */ - struct gc_timing_state timing_state; -#endif - #ifdef Py_GIL_DISABLED /* This is the number of objects that survived the last full collection. It approximates the number of long lived objects @@ -370,6 +331,9 @@ struct _gc_runtime_state { collections, and are awaiting to undergo a full collection for the first time. */ Py_ssize_t long_lived_pending; + + /* True if gc.freeze() has been used. */ + int freeze_active; #endif }; diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index ce24a03de46fb2..8d403911910139 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -45,156 +45,6 @@ typedef struct _gc_runtime_state GCState; // Automatically choose the generation that needs collecting. 
#define GENERATION_AUTO (-1) -#ifdef WITH_GC_TIMING_STATS - -static void p2engine_init(p2_engine* engine, const double quantiles[QUANTILE_COUNT]) { - engine->count = 0; - engine->max = 0.0; - - engine->dn[0] = 0; - engine->dn[MARKER_COUNT - 1] = 1; - - int pointer = 1; - for (int i = 0; i < QUANTILE_COUNT; i++) { - double quantile = quantiles[i]; - engine->dn[pointer] = quantile; - engine->dn[pointer + 1] = quantile / 2; - engine->dn[pointer + 2] = (1 + quantile) / 2; - pointer += 3; - } - - // Sort the markers - for (int i = 0; i < MARKER_COUNT; i++) { - for (int j = i + 1; j < MARKER_COUNT; j++) { - if (engine->dn[i] > engine->dn[j]) { - double temp = engine->dn[i]; - engine->dn[i] = engine->dn[j]; - engine->dn[j] = temp; - } - } - } - - for (int i = 0; i < MARKER_COUNT; i++) { - engine->np[i] = (MARKER_COUNT - 1) * engine->dn[i] + 1; - } -} - -static int sign(double d) { - return (d >= 0) ? 1 : -1; -} - -static double parabolic(p2_engine* engine, int i, int d) { - return engine->q[i] + d / ((engine->n[i + 1] - engine->n[i - 1]) * - ((engine->n[i] - engine->n[i - 1] + d) * ((engine->q[i + 1] - engine->q[i]) / (engine->n[i + 1] - engine->n[i])) + - (engine->n[i + 1] - (engine->n[i] - d)) * ((engine->q[i] - engine->q[i - 1]) / (engine->n[i] - engine->n[i - 1])))); -} - -static double linear(p2_engine* engine, int i, int d) { - return engine->q[i] + d * ((engine->q[i + d] - engine->q[i]) / (engine->n[i + d] - engine->n[i])); -} - -static void p2engine_add(p2_engine* engine, double data) { - int i, k = 0; - if (data > engine->max) { - engine->max = data; - } - if (engine->count >= MARKER_COUNT) { - engine->count++; - // B1 - if (data < engine->q[0]) { - engine->q[0] = data; - k = 1; - } else if (data >= engine->q[MARKER_COUNT - 1]) { - engine->q[MARKER_COUNT - 1] = data; - k = MARKER_COUNT - 1; - } else { - for (i = 1; i < MARKER_COUNT; i++) { - if (data < engine->q[i]) { - k = i; - break; - } - } - } - - // B2 - for (i = k; i < MARKER_COUNT; i++) { - engine->n[i]++; - engine->np[i] = engine->np[i] + engine->dn[i]; - } - - for (i = 0; i < k; i++) { - engine->np[i] = engine->np[i] + engine->dn[i]; - } - - // B3 - for (i = 1; i < MARKER_COUNT - 1; i++) { - double d = engine->np[i] - engine->n[i]; - if ((d >= 1 && engine->n[i + 1] - engine->n[i] > 1) || - (d <= -1 && engine->n[i - 1] - engine->n[i] < -1)) { - double newq = parabolic(engine, i, sign(d)); - if (engine->q[i - 1] < newq && newq < engine->q[i + 1]) { - engine->q[i] = newq; - } else { - engine->q[i] = linear(engine, i, sign(d)); - } - engine->n[i] = engine->n[i] + sign(d); - } - } - } else { - engine->q[engine->count] = data; - engine->count++; - if (engine->count == MARKER_COUNT) { - // We have enough to start the algorithm, initialize - for (i = 0; i < MARKER_COUNT; i++) { - for (int j = i + 1; j < MARKER_COUNT; j++) { - if (engine->q[i] > engine->q[j]) { - double temp = engine->q[i]; - engine->q[i] = engine->q[j]; - engine->q[j] = temp; - } - } - } - for (i = 0; i < MARKER_COUNT; i++) { - engine->n[i] = i + 1; - } - } - } -} - -static double p2engine_result(p2_engine* engine, double quantile) { - if (engine->count < MARKER_COUNT) { - int closest = 1; - for (int i = 0; i < engine->count; i++) { - for (int j = i + 1; j < engine->count; j++) { - if (engine->q[i] > engine->q[j]) { - double temp = engine->q[i]; - engine->q[i] = engine->q[j]; - engine->q[j] = temp; - } - } - } - for (int i = 2; i < engine->count; i++) { - if (fabs((double)i / engine->count - quantile) < - fabs((double)closest / MARKER_COUNT - quantile)) { - closest = i; 
- } - } - return engine->q[closest]; - } else { - int closest = 1; - for (int i = 2; i < MARKER_COUNT - 1; i++) { - if (fabs(engine->dn[i] - quantile) < fabs(engine->dn[closest] - quantile)) { - closest = i; - } - } - return engine->q[closest]; - } -} - -static double gc_timing_quantiles[QUANTILE_COUNT] = {0.5, 0.75, 0.9, 0.95, 0.99}; - -#endif // WITH_GC_TIMING_STATS - // A linked list of objects using the `ob_tid` field as the next pointer. // The linked list pointers are distinct from any real thread ids, because the // thread ids returned by _Py_ThreadId() are also pointers to distinct objects. @@ -739,11 +589,6 @@ visit_decref(PyObject *op, void *arg) return 0; } -unsigned int num_alive; -unsigned int num_immortal; -unsigned int num_checked; -unsigned int num_gc; - // Compute the number of external references to objects in the heap // by subtracting internal references from the refcount. The difference is // computed in the ob_tid field (we restore it later). @@ -756,20 +601,10 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area, return true; } - bool is_alive = gc_is_alive(op); - num_gc++; - if (is_alive) { - num_alive++; - } - if (_Py_IsImmortal(op)) { - num_immortal++; - } - if (is_alive) { + if (gc_is_alive(op)) { return true; } - num_checked++; - // Exclude immortal objects from garbage collection if (_Py_IsImmortal(op)) { op->ob_tid = 0; @@ -1000,21 +835,6 @@ scan_heap_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, static int move_legacy_finalizer_reachable(struct collection_state *state); -#ifdef WITH_GC_TIMING_STATS -FILE *gc_log; -static void -print_gc_times(GCState *gcstate) -{ - fprintf(gc_log, "gc times: runs %ld total %.3fs mark %.3fs max %ldus avg %ldus\n", - gcstate->timing_state.gc_runs, - PyTime_AsSecondsDouble(gcstate->timing_state.gc_total_time), - PyTime_AsSecondsDouble(gcstate->timing_state.gc_mark_time), - (long)_PyTime_AsMicroseconds(gcstate->timing_state.gc_max_pause, _PyTime_ROUND_HALF_EVEN), - (long)_PyTime_AsMicroseconds(gcstate->timing_state.gc_total_time / gcstate->timing_state.gc_runs, _PyTime_ROUND_HALF_EVEN) - ); -} -#endif // WITH_GC_TIMING_STATS - #ifdef GC_ENABLE_MARK_ALIVE static int visit_propagate_alive(PyObject *op, _PyObjectStack *stack) @@ -1049,11 +869,6 @@ static int mark_root_reachable(PyInterpreterState *interp, struct collection_state *state) { -#ifdef WITH_GC_TIMING_STATS - PyTime_t t1; - (void)PyTime_PerfCounterRaw(&t1); -#endif - #ifdef GC_DEBUG // Check that all objects don't have ALIVE bits set gc_visit_heaps(interp, &validate_alive_bits, &state->base); @@ -1089,12 +904,6 @@ mark_root_reachable(PyInterpreterState *interp, propagate_alive_bits(&stack); -#ifdef WITH_GC_TIMING_STATS - PyTime_t t2; - (void)PyTime_PerfCounterRaw(&t2); - interp->gc.timing_state.gc_mark_time += t2 - t1; -#endif - return 0; } #endif // GC_ENABLE_MARK_ALIVE @@ -1111,10 +920,6 @@ deduce_unreachable_heap(PyInterpreterState *interp, gc_visit_heaps(interp, &validate_refcounts, &state->base); #endif - num_alive = 0; - num_gc = 0; - num_checked = 0; - // Identify objects that are directly reachable from outside the GC heap // by computing the difference between the refcount and the number of // incoming references. 
@@ -1288,12 +1093,6 @@ _PyGC_Init(PyInterpreterState *interp) return _PyStatus_NO_MEMORY(); } -#ifdef WITH_GC_TIMING_STATS - gc_log = fopen("/tmp/gc_timing.log", "a"); - //p2engine_init(&gcstate->timing_state.auto_all, gc_timing_quantiles); - p2engine_init(&gcstate->timing_state.auto_full, gc_timing_quantiles); -#endif - return _PyStatus_OK(); } @@ -1670,7 +1469,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, process_delayed_frees(interp, state); #ifdef GC_ENABLE_MARK_ALIVE - if (!state->gcstate->freeze_used) { + if (!state->gcstate->freeze_active) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass. int err = mark_root_reachable(interp, state); @@ -1783,11 +1582,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) invoke_gc_callback(tstate, "start", generation, 0, 0); } -#ifdef WITH_GC_TIMING_STATS - PyTime_t gc_timing_t1; - (void)PyTime_PerfCounterRaw(&gc_timing_t1); -#endif - if (gcstate->debug & _PyGC_DEBUG_STATS) { PySys_WriteStderr("gc: collecting generation %d...\n", generation); show_stats_each_generations(gcstate); @@ -1821,27 +1615,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) n+m, n, state.long_lived_total, d); } -#ifdef WITH_GC_TIMING_STATS - PyTime_t gc_timing_t2, dt; - (void)PyTime_PerfCounterRaw(&gc_timing_t2); - dt = gc_timing_t2 - gc_timing_t1; - if (reason == _Py_GC_REASON_HEAP) { - #if 0 // no generations, always full collection - p2engine_add(&gcstate->timing_state.auto_all, - (double)_PyTime_AsMicroseconds(dt, - _PyTime_ROUND_HALF_EVEN)); - #endif - p2engine_add(&gcstate->timing_state.auto_full, - (double)_PyTime_AsMicroseconds(dt, - _PyTime_ROUND_HALF_EVEN)); - if (dt > gcstate->timing_state.gc_max_pause) { - gcstate->timing_state.gc_max_pause = dt; - } - } - gcstate->timing_state.gc_total_time += dt; - gcstate->timing_state.gc_runs++; -#endif // WITH_GC_TIMING_STATS - // Clear the current thread's free-list again. 
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; _PyObject_ClearFreeLists(&tstate_impl->freelists, 0); @@ -1870,11 +1643,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) } #endif - #ifdef WITH_GC_TIMING_STATS - fprintf(gc_log, "gc alive %d collected %ld checked %d gc %d\n", num_alive, m, num_checked, num_gc); - fflush(gc_log); - #endif - if (PyDTrace_GC_DONE_ENABLED()) { PyDTrace_GC_DONE(n + m); } @@ -2038,7 +1806,7 @@ _PyGC_Freeze(PyInterpreterState *interp) struct visitor_args args; _PyEval_StopTheWorld(interp); GCState *gcstate = get_gc_state(); - gcstate->freeze_used = 1; + gcstate->freeze_active = true; gc_visit_heaps(interp, &visit_freeze, &args); _PyEval_StartTheWorld(interp); } @@ -2060,7 +1828,7 @@ _PyGC_Unfreeze(PyInterpreterState *interp) struct visitor_args args; _PyEval_StopTheWorld(interp); GCState *gcstate = get_gc_state(); - gcstate->freeze_used = 0; + gcstate->freeze_active = false; gc_visit_heaps(interp, &visit_unfreeze, &args); _PyEval_StartTheWorld(interp); } @@ -2204,21 +1972,6 @@ _PyGC_Fini(PyInterpreterState *interp) Py_CLEAR(gcstate->garbage); Py_CLEAR(gcstate->callbacks); - #ifdef WITH_GC_TIMING_STATS - print_gc_times(gcstate); - #if 0 // no generations so all are full collections - for (int i = 0; i < QUANTILE_COUNT; i++) { - double result = p2engine_result(&gcstate->timing_state.auto_all, gc_timing_quantiles[i]); - fprintf(gc_log, "gc timing all Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); - } - #endif - for (int i = 0; i < QUANTILE_COUNT; i++) { - double result = p2engine_result(&gcstate->timing_state.auto_full, gc_timing_quantiles[i]); - fprintf(gc_log, "gc timing full Q%.0f: %.2f\n", gc_timing_quantiles[i]*100, result); - } - fclose(gc_log); - #endif // WITH_GC_TIMING_STATS - /* We expect that none of this interpreters objects are shared with other interpreters. See https://github.com/python/cpython/issues/90228. */ From 5f6ab4c50db7d2f6f218e84426301b9228c11182 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 11/19] wip: code cleanup, small bug fix More code cleanup (better names, comments, simplify error handling). Fix bug in that "alive" bit must be checked in mark_alive_stack_push() to avoid visiting already visited objects. --- Include/internal/pycore_gc.h | 2 +- Python/gc_free_threading.c | 76 ++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 66396545ef2f52..b1806df2706097 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -51,7 +51,7 @@ static inline PyObject* _Py_FROM_GC(PyGC_Head *gc) { # define _PyGC_BITS_FROZEN (1<<3) # define _PyGC_BITS_SHARED (1<<4) # define _PyGC_BITS_ALIVE (1<<5) // Reachable from a known root. 
-# define _PyGC_BITS_DEFERRED (1<<7) // Use deferred reference counting +# define _PyGC_BITS_DEFERRED (1<<6) // Use deferred reference counting #endif #ifdef Py_GIL_DISABLED diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 8d403911910139..242359c674e37a 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -155,13 +155,13 @@ gc_is_unreachable(PyObject *op) return gc_has_bit(op, _PyGC_BITS_UNREACHABLE); } -static void +static inline void gc_set_unreachable(PyObject *op) { gc_set_bit(op, _PyGC_BITS_UNREACHABLE); } -static void +static inline void gc_clear_unreachable(PyObject *op) { gc_clear_bit(op, _PyGC_BITS_UNREACHABLE); @@ -174,14 +174,14 @@ gc_is_alive(PyObject *op) } #ifdef GC_ENABLE_MARK_ALIVE -static void +static inline void gc_set_alive(PyObject *op) { gc_set_bit(op, _PyGC_BITS_ALIVE); } #endif -static void +static inline void gc_clear_alive(PyObject *op) { gc_clear_bit(op, _PyGC_BITS_ALIVE); @@ -448,22 +448,26 @@ gc_visit_thread_stacks(PyInterpreterState *interp) } #ifdef GC_ENABLE_MARK_ALIVE -static bool -mark_alive_stack_push(_PyObjectStack *stack, PyObject *op) +static int +mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) { if (op == NULL) { - return true; + return 0; + } + if (!_PyObject_GC_IS_TRACKED(op)) { + return 0; + } + if (gc_is_alive(op)) { + return 0; // already visited this object } - assert(!gc_is_alive(op)); gc_set_alive(op); - //gc_maybe_merge_refcount(op); if (_PyObjectStack_Push(stack, op) < 0) { - return false; + _PyObjectStack_Clear(stack); + return -1; } - return true; + return 0; } - #ifdef GC_MARK_ALIVE_STACKS static bool gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) @@ -472,11 +476,9 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) // Otherwise we might read into invalid memory due to non-deferred references // being dead already. if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) { - PyObject *obj = PyStackRef_AsPyObjectBorrow(stackref); - if (_PyObject_GC_IS_TRACKED(obj) && !gc_is_frozen(obj)) { - if (!mark_alive_stack_push(stack, obj)) { - return false; - } + PyObject *op = PyStackRef_AsPyObjectBorrow(stackref); + if (mark_alive_stack_push(op, stack) < 0) { + return false; } } return true; @@ -836,16 +838,6 @@ static int move_legacy_finalizer_reachable(struct collection_state *state); #ifdef GC_ENABLE_MARK_ALIVE -static int -visit_propagate_alive(PyObject *op, _PyObjectStack *stack) -{ - if (_PyObject_GC_IS_TRACKED(op) && !gc_is_alive(op)) { - gc_set_alive(op); - return _PyObjectStack_Push(stack, op); - } - return 0; -} - static int propagate_alive_bits(_PyObjectStack *stack) { @@ -857,27 +849,37 @@ propagate_alive_bits(_PyObjectStack *stack) assert(_PyObject_GC_IS_TRACKED(op)); assert(gc_is_alive(op)); traverseproc traverse = Py_TYPE(op)->tp_traverse; - if (traverse(op, (visitproc)&visit_propagate_alive, stack) < 0) { - _PyObjectStack_Clear(stack); + if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) { return -1; } } return 0; } +// Using tp_traverse, mark everything reachable from known root objects +// (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In +// most programs, this marks nearly all objects that are not actually +// unreachable. Actually alive objects can be missed in this pass if +// they are alive due to being referenced from an unknown root (e.g. 
an +// extension module global), some tp_traverse methods are either missing +// or not accurate, or objects that have been untracked (and objects +// referenced by those). If gc.freeze() is used, this pass is disabled +// since it is unlikely to help much. The next stages of cyclic GC will +// ignore objects with the alive bit set. +// +// Returns -1 on failure (out of memory). static int -mark_root_reachable(PyInterpreterState *interp, - struct collection_state *state) +mark_alive_from_roots(PyInterpreterState *interp, + struct collection_state *state) { #ifdef GC_DEBUG - // Check that all objects don't have ALIVE bits set + // Check that all objects don't have alive bit set gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif _PyObjectStack stack = { NULL }; #define STACK_PUSH(op) \ - if (!mark_alive_stack_push(&stack, op)) { \ - _PyObjectStack_Clear(&stack); \ + if (mark_alive_stack_push(op, &stack) < 0) { \ return -1; \ } STACK_PUSH(interp->sysdict); @@ -896,12 +898,12 @@ mark_root_reachable(PyInterpreterState *interp, #endif #ifdef GC_MARK_ALIVE_STACKS if (!gc_visit_thread_stacks_mark_alive(interp, &stack)) { - _PyObjectStack_Clear(&stack); return -1; } #endif #undef STACK_PUSH + // Use tp_traverse to find everything reachable from roots. propagate_alive_bits(&stack); return 0; @@ -1472,7 +1474,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, if (!state->gcstate->freeze_active) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass. - int err = mark_root_reachable(interp, state); + int err = mark_alive_from_roots(interp, state); if (err < 0) { _PyEval_StartTheWorld(interp); PyErr_NoMemory(); @@ -1490,7 +1492,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } #ifdef GC_DEBUG - // At this point, no object should have ALIVE bit set + // At this point, no object should have the alive bit set gc_visit_heaps(interp, &validate_alive_bits, &state->base); #endif From bd46b5fc18b8966829fc3cfd34ee167da5182bbf Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 12/19] wip: untrack tuples in "mark alive" pass Make sure we still do this optimization. There is also a unit test that checks for this. --- Python/gc_free_threading.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 242359c674e37a..32472e498b13c9 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -460,6 +460,16 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) if (gc_is_alive(op)) { return 0; // already visited this object } + if (!_PyObject_HasDeferredRefcount(op)) { + // Untrack objects that can never create reference cycles. Currently + // we only check for tuples containing only non-GC objects. 
+ if (PyTuple_CheckExact(op)) { + _PyTuple_MaybeUntrack(op); + if (!_PyObject_GC_IS_TRACKED(op)) { + return 0; + } + } + } gc_set_alive(op); if (_PyObjectStack_Push(stack, op) < 0) { _PyObjectStack_Clear(stack); From d25cb4a3f0417099ecc97c8c5726f43c386e51e7 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 13/19] wip: enable stacks and extra roots --- Python/gc_free_threading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 32472e498b13c9..4207908d3d4b9e 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -23,10 +23,10 @@ #define GC_ENABLE_MARK_ALIVE 1 // include addtional roots in "mark alive" pass -//#define GC_MARK_ALIVE_EXTRA_ROOTS 1 +#define GC_MARK_ALIVE_EXTRA_ROOTS 1 // include Python stacks as set of known roots -//#define GC_MARK_ALIVE_STACKS 1 +#define GC_MARK_ALIVE_STACKS 1 #ifdef Py_GIL_DISABLED From 347db45e2b277211c24d210e9f944348ee51512d Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 14/19] wip: add helper gc_maybe_untrack() Reduces duplicate code. --- Python/gc_free_threading.c | 42 +++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 4207908d3d4b9e..117a59f33c7938 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -447,6 +447,23 @@ gc_visit_thread_stacks(PyInterpreterState *interp) _Py_FOR_EACH_TSTATE_END(interp); } +// Untrack objects that can never create reference cycles. +// Return true if the object was untracked. +static bool +gc_maybe_untrack(PyObject *op) +{ + // Currently we only check for tuples containing only non-GC objects. In + // theory we could check other immutable objects that contain references + // to non-GC objects. + if (PyTuple_CheckExact(op)) { + _PyTuple_MaybeUntrack(op); + if (!_PyObject_GC_IS_TRACKED(op)) { + return true; + } + } + return false; +} + #ifdef GC_ENABLE_MARK_ALIVE static int mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) @@ -460,16 +477,12 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) if (gc_is_alive(op)) { return 0; // already visited this object } - if (!_PyObject_HasDeferredRefcount(op)) { - // Untrack objects that can never create reference cycles. Currently - // we only check for tuples containing only non-GC objects. - if (PyTuple_CheckExact(op)) { - _PyTuple_MaybeUntrack(op); - if (!_PyObject_GC_IS_TRACKED(op)) { - return 0; - } - } + if (gc_maybe_untrack(op)) { + return 0; // was untracked, don't visit it } + + // Need to call tp_traverse on this object. Add to stack and mark it + // alive so we don't re-visit it a second time. gc_set_alive(op); if (_PyObjectStack_Push(stack, op) < 0) { _PyObjectStack_Clear(stack); @@ -632,14 +645,9 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area, _PyObject_ASSERT(op, refcount >= 0); if (refcount > 0 && !_PyObject_HasDeferredRefcount(op)) { - // Untrack tuples and dicts as necessary in this pass, but not objects - // with zero refcount, which we will want to collect. 
- if (PyTuple_CheckExact(op)) { - _PyTuple_MaybeUntrack(op); - if (!_PyObject_GC_IS_TRACKED(op)) { - gc_restore_refs(op); - return true; - } + if (gc_maybe_untrack(op)) { + gc_restore_refs(op); + return true; } } From 680f80fec7e3797ca5a74f11824975283e0ce814 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:09:55 -0800 Subject: [PATCH 15/19] Add NEWS entry. --- .../2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst new file mode 100644 index 00000000000000..34952e9abb66e5 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-01-13-17-03-49.gh-issue-128807.BGGBxD.rst @@ -0,0 +1,6 @@ +Add a marking phase to the free-threaded GC. This is similar to what was +done in GH-126491. Since the free-threaded GC does not have generations and +is not incremental, the marking phase looks for all objects reachable from +known roots. The roots are objects known to not be garbage, like the module +dictionary for :mod:`sys`. For most programs, this marking phase should +make the GC a bit faster since typically less work is done per object. From 0c173c976c4602e1f414672ce5670f70060461b6 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Mon, 13 Jan 2025 17:22:52 -0800 Subject: [PATCH 16/19] wip: spelling fixes, minor code cleanup --- Python/gc_free_threading.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 117a59f33c7938..4b7e1e24f1e60e 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -7,7 +7,6 @@ #include "pycore_freelist.h" // _PyObject_ClearFreeLists() #include "pycore_initconfig.h" #include "pycore_interp.h" // PyInterpreterState.gc -#include "pycore_time.h" #include "pycore_object.h" #include "pycore_object_alloc.h" // _PyObject_MallocWithType() #include "pycore_object_stack.h" @@ -22,7 +21,7 @@ // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 -// include addtional roots in "mark alive" pass +// include additional roots in "mark alive" pass #define GC_MARK_ALIVE_EXTRA_ROOTS 1 // include Python stacks as set of known roots @@ -482,7 +481,7 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) } // Need to call tp_traverse on this object. Add to stack and mark it - // alive so we don't re-visit it a second time. + // alive so we don't traverse it a second time. gc_set_alive(op); if (_PyObjectStack_Push(stack, op) < 0) { _PyObjectStack_Clear(stack); @@ -877,13 +876,17 @@ propagate_alive_bits(_PyObjectStack *stack) // Using tp_traverse, mark everything reachable from known root objects // (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In // most programs, this marks nearly all objects that are not actually -// unreachable. Actually alive objects can be missed in this pass if -// they are alive due to being referenced from an unknown root (e.g. an -// extension module global), some tp_traverse methods are either missing -// or not accurate, or objects that have been untracked (and objects -// referenced by those). If gc.freeze() is used, this pass is disabled -// since it is unlikely to help much. The next stages of cyclic GC will -// ignore objects with the alive bit set. 
+// unreachable. +// +// Actually alive objects can be missed in this pass if they are alive +// due to being referenced from an unknown root (e.g. an extension +// module global), some tp_traverse methods are either missing or not +// accurate, or objects that have been untracked. Objects that are only +// reachable from the aforementioned are also missed. +// +// If gc.freeze() is used, this pass is disabled since it is unlikely to +// help much. The next stages of cyclic GC will ignore objects with the +// alive bit set. // // Returns -1 on failure (out of memory). static int @@ -1631,8 +1634,8 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) (void)PyTime_PerfCounterRaw(&t2); double d = PyTime_AsSecondsDouble(t2 - t1); PySys_WriteStderr( - "gc: done, %zd unreachable, %zd uncollectable, %zd long-lived, %.4fs elapsed\n", - n+m, n, state.long_lived_total, d); + "gc: done, %zd unreachable, %zd uncollectable, %.4fs elapsed\n", + n+m, n, d); } // Clear the current thread's free-list again. From 79ea47e16b50d2e2c12e30201c8fce8aba27dcaf Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 14 Jan 2025 14:45:09 -0800 Subject: [PATCH 17/19] wip: use -1 for error convention for functions --- Python/gc_free_threading.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 4b7e1e24f1e60e..078def22c489ff 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -491,7 +491,7 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) } #ifdef GC_MARK_ALIVE_STACKS -static bool +static int gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) { // Note: we MUST check that it is deferred before checking the rest. @@ -500,13 +500,13 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) { PyObject *op = PyStackRef_AsPyObjectBorrow(stackref); if (mark_alive_stack_push(op, stack) < 0) { - return false; + return -1; } } - return true; + return 0; } -static bool +static int gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack) { _Py_FOR_EACH_TSTATE_BEGIN(interp, p) { @@ -518,18 +518,18 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st PyCodeObject *co = (PyCodeObject *)executable; int max_stack = co->co_nlocalsplus + co->co_stacksize; - if (!gc_visit_stackref_mark_alive(stack, f->f_executable)) { - return false; + if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) { + return -1; } for (int i = 0; i < max_stack; i++) { - if (!gc_visit_stackref_mark_alive(stack, f->localsplus[i])) { - return false; + if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) { + return -1; } } } } _Py_FOR_EACH_TSTATE_END(interp); - return true; + return 0; } #endif // GC_MARK_ALIVE_STACKS #endif // GC_ENABLE_MARK_ALIVE @@ -918,7 +918,7 @@ mark_alive_from_roots(PyInterpreterState *interp, } #endif #ifdef GC_MARK_ALIVE_STACKS - if (!gc_visit_thread_stacks_mark_alive(interp, &stack)) { + if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) { return -1; } #endif From 0090365728756749142e1c2202c6213c44a0aa3d Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 14 Jan 2025 15:03:55 -0800 Subject: [PATCH 18/19] wip: improve error handling for OOM Use gc_abort_mark_alive() helper in case of OOM. 
In addition to freeing the stack, we need to ensure that no object has the alive bit set on it. This also adds missing error handling in the case that propagate_alive_bits() fails. --- Python/gc_free_threading.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 078def22c489ff..17614526303fcb 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -484,12 +484,37 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack) // alive so we don't traverse it a second time. gc_set_alive(op); if (_PyObjectStack_Push(stack, op) < 0) { - _PyObjectStack_Clear(stack); return -1; } return 0; } +static bool +gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area, + void *block, size_t block_size, void *args) +{ + PyObject *op = op_from_block(block, args, false); + if (op == NULL) { + return true; + } + if (gc_is_alive(op)) { + gc_clear_alive(op); + } + return true; +} + +static void +gc_abort_mark_alive(PyInterpreterState *interp, + struct collection_state *state, + _PyObjectStack *stack) +{ + // We failed to allocate memory for "stack" while doing the "mark + // alive" phase. In that case, free the object stack and make sure + // that no objects have the alive bit set. + _PyObjectStack_Clear(stack); + gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base); +} + #ifdef GC_MARK_ALIVE_STACKS static int gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref) @@ -901,6 +926,7 @@ mark_alive_from_roots(PyInterpreterState *interp, #define STACK_PUSH(op) \ if (mark_alive_stack_push(op, &stack) < 0) { \ + gc_abort_mark_alive(interp, state, &stack); \ return -1; \ } STACK_PUSH(interp->sysdict); @@ -919,13 +945,17 @@ mark_alive_from_roots(PyInterpreterState *interp, #endif #ifdef GC_MARK_ALIVE_STACKS if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) { + gc_abort_mark_alive(interp, state, &stack); return -1; } #endif #undef STACK_PUSH // Use tp_traverse to find everything reachable from roots. - propagate_alive_bits(&stack); + if (propagate_alive_bits(&stack) < 0) { + gc_abort_mark_alive(interp, state, &stack); + return -1; + } return 0; } From 610069139cef1c74651d5260e15bf7f870fd8ec9 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Tue, 14 Jan 2025 15:12:45 -0800 Subject: [PATCH 19/19] Add comment about gc.freeze() disabling marking. --- Python/gc_free_threading.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 17614526303fcb..d1023d9351086f 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1522,6 +1522,12 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, process_delayed_frees(interp, state); #ifdef GC_ENABLE_MARK_ALIVE + // If gc.freeze() was used, it seems likely that doing this "mark alive" + // pass will not be a performance win. Typically the majority of alive + // objects will be marked as frozen and will be skipped anyhow, without + // doing this extra work. Doing this pass also defeats one of the + // purposes of using freeze: avoiding writes to objects that are frozen. + // So, we just skip this if gc.freeze() was used. if (!state->gcstate->freeze_active) { // Mark objects reachable from known roots as "alive". These will // be ignored for rest of the GC pass.
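
Taken together, PATCH 11 and PATCH 18 amount to a worklist traversal with an undo path: push the known roots, mark each object alive before it is pushed so it is traversed at most once, and, if memory runs out part way through, clear any alive bits already set so the rest of the collection starts from a clean state. The minimal, self-contained C sketch below models only that control flow; obj_t, worklist_t, worklist_push, abort_mark_alive and mark_alive_demo are illustrative names, not the CPython API (the real code works on PyObject via tp_traverse, _PyObjectStack and gc_visit_heaps).

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_CHILDREN 4

typedef struct obj {
    bool alive;                          /* models _PyGC_BITS_ALIVE */
    struct obj *children[MAX_CHILDREN];  /* references a traverse would report */
} obj_t;

typedef struct {
    obj_t **items;
    size_t len, cap;
} worklist_t;

/* Mark an object alive and schedule it for traversal.  NULL references and
 * objects already marked are skipped, so each object is traversed at most
 * once.  Returns -1 on allocation failure. */
static int
worklist_push(worklist_t *w, obj_t *op)
{
    if (op == NULL || op->alive) {
        return 0;
    }
    op->alive = true;   /* set the bit before pushing, never re-push */
    if (w->len == w->cap) {
        size_t cap = w->cap ? w->cap * 2 : 16;
        obj_t **items = realloc(w->items, cap * sizeof(*items));
        if (items == NULL) {
            return -1;
        }
        w->items = items;
        w->cap = cap;
    }
    w->items[w->len++] = op;
    return 0;
}

/* On failure, undo partial marking: sweep every object and clear the alive
 * flag, mirroring the abort path added in PATCH 18. */
static void
abort_mark_alive(worklist_t *w, obj_t **heap, size_t nheap)
{
    free(w->items);
    for (size_t i = 0; i < nheap; i++) {
        heap[i]->alive = false;
    }
}

/* Mark everything reachable from roots.  Returns 0 on success, -1 on OOM,
 * in which case no object is left with the alive flag set. */
static int
mark_alive_demo(obj_t **roots, size_t nroots, obj_t **heap, size_t nheap)
{
    worklist_t w = {0};
    for (size_t i = 0; i < nroots; i++) {
        if (worklist_push(&w, roots[i]) < 0) {
            abort_mark_alive(&w, heap, nheap);
            return -1;
        }
    }
    while (w.len > 0) {
        obj_t *op = w.items[--w.len];
        for (int i = 0; i < MAX_CHILDREN; i++) {   /* "traverse" op */
            if (worklist_push(&w, op->children[i]) < 0) {
                abort_mark_alive(&w, heap, nheap);
                return -1;
            }
        }
    }
    free(w.items);
    return 0;
}

int
main(void)
{
    obj_t a = {0}, b = {0}, c = {0}, d = {0};
    a.children[0] = &b;
    b.children[0] = &c;                  /* d is not reachable from the root */
    obj_t *heap[] = {&a, &b, &c, &d};
    obj_t *roots[] = {&a};
    if (mark_alive_demo(roots, 1, heap, 4) == 0) {
        printf("a=%d b=%d c=%d d=%d\n", a.alive, b.alive, c.alive, d.alive);
    }
    return 0;
}

Setting the bit before the push is exactly the property the PATCH 11 message calls out: without the early check, an object reachable along several paths could be pushed and traversed more than once.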
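
PATCH 12 and PATCH 14 fold the existing tuple-untracking optimization into the same push helper: a container whose elements can never themselves be GC-tracked cannot be part of a reference cycle, so it can be untracked instead of marked and traversed. A minimal sketch of that check follows, assuming hypothetical item_t/container_t types and a maybe_untrack helper rather than the real _PyTuple_MaybeUntrack machinery.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    bool gc_tracked;   /* true if this item can hold references (like a list) */
} item_t;

typedef struct {
    bool gc_tracked;
    size_t nitems;
    item_t **items;
} container_t;

/* Untrack the container when no element is itself GC-tracked; such a
 * container can never participate in a reference cycle.  Returns true if
 * the container ended up untracked. */
static bool
maybe_untrack(container_t *c)
{
    for (size_t i = 0; i < c->nitems; i++) {
        if (c->items[i] != NULL && c->items[i]->gc_tracked) {
            return false;   /* an element could form a cycle; keep tracking */
        }
    }
    c->gc_tracked = false;
    return true;
}

int
main(void)
{
    item_t num = { .gc_tracked = false };   /* e.g. an int: holds no references */
    item_t list = { .gc_tracked = true };   /* e.g. a list: can hold references */
    item_t *only_atoms[] = { &num };
    item_t *mixed[] = { &num, &list };
    container_t t1 = { true, 1, only_atoms };
    container_t t2 = { true, 2, mixed };
    printf("t1 untracked: %d\n", maybe_untrack(&t1));   /* 1 */
    printf("t2 untracked: %d\n", maybe_untrack(&t2));   /* 0 */
    return 0;
}

In the patches, this check happens before the object is marked and pushed, so an untracked tuple is neither set alive nor traversed by the rest of the pass.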