Fix audit findings: overflow, VLA guards, pread checks, portability

farkasmark · farkasmark · commit b3873c2c8ee0 · 2026-03-21T13:55:19.000+01:00
H1: Add BN_MAX_SCALE_BLOCKS/8 guard to Q4_K/Q6_K VLA allocations
H2: Cap cache n_slots to INT_MAX/2, use size_t for raw division
H3: Use unsigned for hash_size computation to avoid signed overflow
M1: Check/log fallback pread return values when prefetch fails
M2: Make threadpool dispatching flag _Atomic
M3: Better hash: layer*65537+expert instead of 16-bit truncation
M4: Guard layers[0] access with n_layers > 0 check
M5: Replace memset(0xFF) with explicit -1 loop for portability
L1: Static assert BN_QK_K % 16 == 0 for NEON alignment
L2-L3: Assert n divisibility in Q8_K and Q8_0 quantization
L4: Overflow-safe chunk size computation in threadpool
L5: Clean up partial prefetch init (free succeeded thread on failure)
L6: Move atomic cursors to pool-internal storage (no public _Atomic)
diff --git a/include/threadpool.h b/include/threadpool.h
@@ -4,19 +4,12 @@
 // Persistent pthread thread pool with atomic work-stealing dispatch.
 // Threads grab chunks of rows via atomic_fetch_add for load balancing.
 
-#ifndef __EMSCRIPTEN__
-#include <stdatomic.h>
-#endif
-
 typedef void (*bn_tp_fn)(void *ctx, int start, int end);
 
 typedef struct {
     bn_tp_fn fn;    // range function: called with [start, end)
     void *ctx;      // opaque context pointer
     int   n;        // iteration count
-#ifndef __EMSCRIPTEN__
-    _Atomic int cursor;  // atomic work-stealing cursor (initialized by dispatch)
-#endif
 } BnTPTask;
 
 typedef struct BnThreadPool BnThreadPool;
diff --git a/src/main.c b/src/main.c
@@ -290,7 +290,8 @@ int main(int argc, char **argv) {
         bn_moe_prefetch_create(model.moe_state);
 
         // Create expert LRU cache (pread only)
-        if (args.cache_mb > 0 && !model.moe_state->mmap_base && model.moe_state->fd >= 0) {
+        if (args.cache_mb > 0 && !model.moe_state->mmap_base && model.moe_state->fd >= 0
+            && model.config.n_layers > 0) {
             BnMoEExpertMap *em = &model.weights.layers[0].expert_map;
             model.moe_state->cache = bn_moe_cache_create(
                 (size_t)args.cache_mb * 1024 * 1024,
diff --git a/src/moe.c b/src/moe.c
@@ -6,6 +6,7 @@
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
+#include <limits.h>
 
 #ifndef __EMSCRIPTEN__
 #include <unistd.h>
@@ -43,7 +44,7 @@ typedef struct {
 } BnMoECache;
 
 static uint32_t moe_cache_hash(int layer, int expert_idx) {
-    uint32_t key = ((uint32_t)layer << 16) | (uint32_t)(expert_idx & 0xFFFF);
+    uint32_t key = (uint32_t)layer * 65537u + (uint32_t)expert_idx;
     // murmurhash3 finalizer
     key ^= key >> 16;
     key *= 0x85ebca6b;
@@ -187,8 +188,10 @@ void *bn_moe_cache_create(size_t budget_bytes, size_t gate_bytes,
     size_t entry_bytes = gate_bytes + up_bytes + down_bytes;
     if (entry_bytes == 0) return NULL;
 
-    int n_slots = (int)(budget_bytes / entry_bytes);
-    if (n_slots < 1) return NULL;
+    size_t raw_slots = budget_bytes / entry_bytes;
+    if (raw_slots < 1) return NULL;
+    if (raw_slots > (size_t)INT_MAX / 2) raw_slots = (size_t)INT_MAX / 2;  // cap to avoid overflow
+    int n_slots = (int)raw_slots;
 
     BnMoECache *c = (BnMoECache *)calloc(1, sizeof(BnMoECache));
     if (!c) return NULL;
@@ -198,10 +201,10 @@ void *bn_moe_cache_create(size_t budget_bytes, size_t gate_bytes,
     c->up_bytes = up_bytes;
     c->n_slots = n_slots;
 
-    // Hash table: next power of 2 >= 2 * n_slots
-    int hs = 1;
-    while (hs < 2 * n_slots) hs *= 2;
-    c->hash_size = hs;
+    // Hash table: next power of 2 >= 2 * n_slots (unsigned to avoid overflow)
+    unsigned hs = 1;
+    while (hs < (unsigned)n_slots * 2) hs *= 2;
+    c->hash_size = (int)hs;
 
     // Allocate slab (32-byte aligned)
     size_t slab_size = (size_t)n_slots * entry_bytes;
@@ -222,8 +225,8 @@ void *bn_moe_cache_create(size_t budget_bytes, size_t gate_bytes,
         return NULL;
     }
 
-    // Initialize
-    memset(c->hash_table, 0xFF, (size_t)hs * sizeof(int));  // -1
+    // Initialize hash table to -1 (empty)
+    for (int i = 0; i < (int)hs; i++) c->hash_table[i] = -1;
     c->lru_head = c->lru_tail = -1;
 
     // Build free list (singly-linked via .next)
@@ -896,14 +899,16 @@ void bn_moe_forward(BnModel *m, BnLayerWeights *lw, int l) {
                     ms->prefetch_wait_ms += moe_time_ms() - tw;
                     COLLECT_PF_STATS(pf_gu);
                     if (!ok) {
-                        pread(ms->fd, miss_g_dst, miss_g_sz, (off_t)miss_g_off);
-                        pread(ms->fd, miss_u_dst, miss_u_sz, (off_t)miss_u_off);
+                        if (pread(ms->fd, miss_g_dst, miss_g_sz, (off_t)miss_g_off) < 0)
+                            SH_LOG_ERROR("Fallback gate pread failed");
+                        if (pread(ms->fd, miss_u_dst, miss_u_sz, (off_t)miss_u_off) < 0)
+                            SH_LOG_ERROR("Fallback up pread failed");
                     }
                 } else {
-                    pread(ms->fd, miss_g_dst, miss_g_sz, (off_t)miss_g_off);
-                    pread(ms->fd, miss_u_dst, miss_u_sz, (off_t)miss_u_off);
+                    (void)pread(ms->fd, miss_g_dst, miss_g_sz, (off_t)miss_g_off);
+                    (void)pread(ms->fd, miss_u_dst, miss_u_sz, (off_t)miss_u_off);
                     if (!pf_dn)
-                        pread(ms->fd, miss_d_dst, miss_d_sz, (off_t)miss_d_off);
+                        (void)pread(ms->fd, miss_d_dst, miss_d_sz, (off_t)miss_d_off);
                 }
                 gate_ptr = miss_g_dst;
                 up_ptr   = miss_u_dst;
@@ -934,13 +939,15 @@ void bn_moe_forward(BnModel *m, BnLayerWeights *lw, int l) {
                     ms->prefetch_wait_ms += moe_time_ms() - tw;
                     COLLECT_PF_STATS(pf_gu);
                     if (!ok) {
-                        pread(ms->fd, g_dst, g_sz, (off_t)g_off);
-                        pread(ms->fd, u_dst, u_sz, (off_t)u_off);
+                        if (pread(ms->fd, g_dst, g_sz, (off_t)g_off) < 0)
+                            SH_LOG_ERROR("Fallback gate pread failed");
+                        if (pread(ms->fd, u_dst, u_sz, (off_t)u_off) < 0)
+                            SH_LOG_ERROR("Fallback up pread failed");
                     }
                 } else {
-                    pread(ms->fd, g_dst, g_sz, (off_t)g_off);
-                    pread(ms->fd, u_dst, u_sz, (off_t)u_off);
-                    pread(ms->fd, d_dst, d_sz, (off_t)d_off);
+                    (void)pread(ms->fd, g_dst, g_sz, (off_t)g_off);
+                    (void)pread(ms->fd, u_dst, u_sz, (off_t)u_off);
+                    (void)pread(ms->fd, d_dst, d_sz, (off_t)d_off);
                 }
 
                 gate_ptr = g_dst;
@@ -1148,10 +1155,14 @@ void bn_moe_prefetch_create(BnMoEState *ms) {
     if (ms->fd >= 0 && !ms->mmap_base) {
         ms->prefetch = moe_prefetch_init(ms->fd);
         ms->prefetch_down = moe_prefetch_init(ms->fd);
-        if (ms->prefetch && ms->prefetch_down)
+        if (ms->prefetch && ms->prefetch_down) {
             SH_LOG_INFO("MoE I/O prefetch threads", "status", "2 created (gate+up, down)");
-        else
-            SH_LOG_INFO("MoE I/O prefetch threads", "status", "partial");
+        } else {
+            // Clean up partial init — free whichever succeeded
+            if (ms->prefetch) { moe_prefetch_free((BnMoEPrefetch *)ms->prefetch); ms->prefetch = NULL; }
+            if (ms->prefetch_down) { moe_prefetch_free((BnMoEPrefetch *)ms->prefetch_down); ms->prefetch_down = NULL; }
+            SH_LOG_WARN("MoE I/O prefetch threads failed to create");
+        }
     }
 #endif
 }
diff --git a/src/quant/dispatch.c b/src/quant/dispatch.c
@@ -128,6 +128,7 @@ void bn_quant_matvec(float *out, const BnQWeight *W, const float *x,
     if (W->type == BN_GGUF_TENSOR_Q6_K) {
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
         int n_sb = W->cols / BN_QK_K;
+        if (n_sb < 1 || n_sb > BN_MAX_SCALE_BLOCKS / 8) return;
         float q8k_d[n_sb];
         int16_t q8k_bsums[n_sb * 16];
         bn_quant_x_to_q8k(x, x_q_buf, q8k_d, q8k_bsums, W->cols);
@@ -173,6 +174,7 @@ void bn_quant_matvec(float *out, const BnQWeight *W, const float *x,
     if (W->type == BN_GGUF_TENSOR_Q4_K) {
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
         int n_sb = W->cols / BN_QK_K;
+        if (n_sb < 1 || n_sb > BN_MAX_SCALE_BLOCKS / 8) return;
         float q8k_d[n_sb];
         int16_t q8k_bsums[n_sb * 16];
         bn_quant_x_to_q8k(x, x_q_buf, q8k_d, q8k_bsums, W->cols);
@@ -632,6 +634,7 @@ void bn_quant_matvec_batch(const BnMatvecTask *tasks, int n_tasks,
 
     if (all_q6k && n_tasks <= BN_MAX_BATCH) {
         int n_sb = cols / BN_QK_K;
+        if (n_sb < 1 || n_sb > BN_MAX_SCALE_BLOCKS / 8) { for (int t = 0; t < n_tasks; t++) bn_quant_matvec(tasks[t].out, tasks[t].W, x, x_q_buf, pool); return; }
         float q8k_d[n_sb];
         int16_t q8k_bsums[n_sb * 16];
         bn_quant_x_to_q8k(x, x_q_buf, q8k_d, q8k_bsums, cols);
@@ -650,6 +653,7 @@ void bn_quant_matvec_batch(const BnMatvecTask *tasks, int n_tasks,
 
     if (all_q4k && n_tasks <= BN_MAX_BATCH) {
         int n_sb = cols / BN_QK_K;
+        if (n_sb < 1 || n_sb > BN_MAX_SCALE_BLOCKS / 8) { for (int t = 0; t < n_tasks; t++) bn_quant_matvec(tasks[t].out, tasks[t].W, x, x_q_buf, pool); return; }
         float q8k_d[n_sb];
         int16_t q8k_bsums[n_sb * 16];
         bn_quant_x_to_q8k(x, x_q_buf, q8k_d, q8k_bsums, cols);
diff --git a/src/quant/x_quant_neon.c b/src/quant/x_quant_neon.c
@@ -1,5 +1,6 @@
 #include "quant_internal.h"
 #include <arm_neon.h>
+#include <assert.h>
 #include <math.h>
 
 // Quantize float vector x[n] to int8, returning scale = amax/127.
@@ -113,11 +114,14 @@ void bn_quant_f16_rows_to_i8(const uint16_t *f16, int8_t *i8_out,
     }
 }
 
+_Static_assert(BN_QK_K % 16 == 0, "BN_QK_K must be a multiple of 16 for NEON");
+
 // Q8_K quantization: 256-element super-blocks with bsums for Q4_K SDOT.
 // x_d[n/256]: one float scale per super-block
 // x_bsums[n/256 * 16]: int16 sum per 16-element group (for min correction)
 void bn_quant_x_to_q8k(const float *x, int8_t *x_q, float *x_d,
                          int16_t *x_bsums, int n) {
+    assert(n % BN_QK_K == 0 && "bn_quant_x_to_q8k: n must be multiple of BN_QK_K");
     int n_sb = n / BN_QK_K;
     for (int sb = 0; sb < n_sb; sb++) {
         const float *xb = x + sb * BN_QK_K;
@@ -170,6 +174,7 @@ void bn_quant_x_to_q8k(const float *x, int8_t *x_q, float *x_d,
 // Per-block Q8_0 quantization for Q4_0 integer dot product path.
 // Quantizes each 32-element block independently with its own scale.
 void bn_quant_x_to_q8_blocks(const float *x, int8_t *x_q, float *x_scales, int n) {
+    assert(n % 32 == 0 && "bn_quant_x_to_q8_blocks: n must be multiple of 32");
     int n_blocks = n / 32;
     for (int b = 0; b < n_blocks; b++) {
         const float *xb = x + b * 32;
diff --git a/src/threadpool.c b/src/threadpool.c
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdatomic.h>
+#include <limits.h>
 #include <assert.h>
 
 #if defined(__APPLE__)
@@ -19,32 +20,36 @@ typedef struct {
     int tid;
 } WorkerArg;
 
+#define TP_MAX_TASKS 32  // max concurrent tasks per dispatch
+
 struct BnThreadPool {
     pthread_t    *threads;
     int           n_workers;   // background threads
     int           n_threads;   // n_workers + 1 (main)
     BnTPTask     *tasks;
     int           n_tasks;
+    _Atomic int   cursors[TP_MAX_TASKS];  // atomic work-stealing cursors
     pthread_mutex_t mtx;
     pthread_cond_t  work_cond;
     pthread_cond_t  done_cond;
     int64_t       generation;
     int           n_done;
     int           shutdown;
-    int           dispatching; // reentrancy guard
+    _Atomic int   dispatching; // reentrancy guard (main-thread-only, atomic for safety)
 };
 
 // Execute all tasks via atomic work-stealing with adaptive chunk size.
 // Chunk = n / (4 * n_threads) — mostly static, stealing for tail imbalance.
-static void tp_execute(const BnThreadPool *pool) {
+static void tp_execute(BnThreadPool *pool) {
     int nt = pool->n_threads;
     for (int t = 0; t < pool->n_tasks; t++) {
         BnTPTask *task = &pool->tasks[t];
         int n = task->n;
-        int chunk = n / (nt * 4);
+        int nt4 = nt <= INT_MAX / 4 ? nt * 4 : nt;  // avoid overflow
+        int chunk = n / nt4;
         if (chunk < TP_CHUNK_MIN) chunk = TP_CHUNK_MIN;
         for (;;) {
-            int start = atomic_fetch_add_explicit(&task->cursor, chunk,
+            int start = atomic_fetch_add_explicit(&pool->cursors[t], chunk,
                                                    memory_order_relaxed);
             if (start >= n) break;
             int end = start + chunk;
@@ -180,9 +185,10 @@ void bn_tp_dispatch(BnThreadPool *pool, BnTPTask *tasks, int n_tasks) {
     assert(!pool->dispatching && "bn_tp_dispatch is not reentrant");
     pool->dispatching = 1;
 
-    // Initialize atomic cursors
-    for (int t = 0; t < n_tasks; t++)
-        atomic_store_explicit(&tasks[t].cursor, 0, memory_order_relaxed);
+    // Initialize atomic cursors (pool-internal storage)
+    int capped_tasks = n_tasks <= TP_MAX_TASKS ? n_tasks : TP_MAX_TASKS;
+    for (int t = 0; t < capped_tasks; t++)
+        atomic_store_explicit(&pool->cursors[t], 0, memory_order_relaxed);
 
     // Set up work and wake workers
     pthread_mutex_lock(&pool->mtx);