diff --git a/src/solver.rs b/src/solver.rs
index 78abeee..2ef2ee6 100644
--- a/src/solver.rs
+++ b/src/solver.rs
@@ -21,8 +21,12 @@
 const GPU_POLL_TIMEOUT: Duration = Duration::from_secs(5);
 const MAX_DISTINGUISHED_POINTS: u32 = 65_536;
 const JUMP_TABLE_SIZE: u32 = 256;
-/// Target dispatch time in milliseconds (stay under TDR threshold)
-const TARGET_DISPATCH_MS: u128 = 50;
+/// Target dispatch time in milliseconds for calibration.
+///
+/// This is still comfortably below the multi-second GPU watchdog budgets on the
+/// supported backends, but high enough to avoid overpaying host round-trip cost
+/// on benchmark-sized solves.
+const TARGET_DISPATCH_MS: u128 = 120;
 
 struct JumpTableRefs<'a> {
     jump_points: &'a [crate::gpu::GpuAffinePoint],
@@ -198,7 +202,7 @@ impl KangarooSolver {
         num_kangaroos: u32,
         verbose: bool,
     ) -> Result<Self> {
-        let variant = if ctx.max_workgroup_size() >= 128 && num_kangaroos > 65_536 {
+        let variant = if ctx.max_workgroup_size() >= 128 && num_kangaroos >= 65_536 {
             WorkgroupVariant::Wg128
         } else {
             WorkgroupVariant::Wg64
@@ -594,7 +598,11 @@ impl KangarooSolver {
 
     /// Calibrate steps_per_call by measuring actual GPU dispatch times
     fn calibrate(&mut self, dp_bits: u32, verbose: bool) -> Result<()> {
-        let candidates = [16u32, 32, 64, 128, 256, 512];
+        // Benchmark-sized solves hit a sharp cliff between 24 and 32 steps on the
+        // tested GPUs, so probe a few low-end values before jumping back to the
+        // usual power-of-two sweep. Keep the list short - calibration dispatches
+        // still burn startup work even though their DP output gets dropped.
+        let candidates = [16u32, 17, 18, 24, 64, 128, 256, 512];
         let mut best_steps = candidates[0];
         let dp_meta = Self::dp_meta(dp_bits);