Merge pull request #87 from MatteoMer/perf/dory-pairing-microopts

MatteoMer · web-flow · commit 2bc7f5c57c6d · 2026-04-19T10:35:19.000+01:00
perf: Dory/pairing micro-optimizations
diff --git a/packages/zolt-arith/src/field/pairing.zig b/packages/zolt-arith/src/field/pairing.zig
@@ -584,7 +584,7 @@ const MAX_UNPREPARED_BATCH: usize = 8;
 
 /// Maximum pairs per sub-batch for prepared Miller loop.
 /// Only reads EllCoeff per step (~48 bytes/pair), so larger batches fit L1.
-const MAX_PREPARED_BATCH: usize = 16;
+const MAX_PREPARED_BATCH: usize = 64;
 
 /// Batched Miller loop using precomputed G2 coefficients.
 /// Shares a single Fp12.square() per ATE iteration across all pairs,
@@ -707,31 +707,34 @@ pub fn batchedMillerLoopUnprepared(
             f = f.square();
         }
 
-        // Doubling step for all pairs
-        for (0..n) |k| {
-            if (g1_points[k].infinity or g2_points[k].infinity) continue;
-            const coeffs_dbl = rs[k].double_in_place(two_inv);
-            const c0_eval = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
-            const c1_eval = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
-            f = fp12MulBy034(f, c0_eval, c1_eval, coeffs_dbl.c2);
-        }
-
         const bit = ATE_LOOP_COUNT[idx - 1];
-        if (bit == 1) {
+
+        if (bit == 1 or bit == -1) {
+            // Non-zero bit: fold the doubling and addition lines into one sparse×sparse product
             for (0..n) |k| {
                 if (g1_points[k].infinity or g2_points[k].infinity) continue;
-                const coeffs_add = rs[k].add_in_place(g2_points[k]);
-                const c0_add = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
-                const c1_add = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
-                f = fp12MulBy034(f, c0_add, c1_add, coeffs_add.c2);
+                // Doubling line coefficients
+                const coeffs_dbl = rs[k].double_in_place(two_inv);
+                const dbl_c0 = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
+                const dbl_c1 = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
+                // Addition line coefficients
+                const q = if (bit == 1) g2_points[k] else nqs[k];
+                const coeffs_add = rs[k].add_in_place(q);
+                const add_c0 = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
+                const add_c1 = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
+                // Sparse × sparse combination (6 Fp2.mul)
+                const combined = fp12Mul034By034(dbl_c0, dbl_c1, coeffs_dbl.c2, add_c0, add_c1, coeffs_add.c2);
+                // 01234-sparse × full (17 Fp2.mul)
+                f = fp12MulBy01234(f, combined);
             }
-        } else if (bit == -1) {
+        } else {
+            // Zero bit: only doubling line
             for (0..n) |k| {
                 if (g1_points[k].infinity or g2_points[k].infinity) continue;
-                const coeffs_add = rs[k].add_in_place(nqs[k]);
-                const c0_add = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
-                const c1_add = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
-                f = fp12MulBy034(f, c0_add, c1_add, coeffs_add.c2);
+                const coeffs_dbl = rs[k].double_in_place(two_inv);
+                const c0_eval = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
+                const c1_eval = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
+                f = fp12MulBy034(f, c0_eval, c1_eval, coeffs_dbl.c2);
             }
         }
     }
diff --git a/packages/zolt-arith/src/msm/glv.zig b/packages/zolt-arith/src/msm/glv.zig
@@ -445,35 +445,92 @@ pub fn glvScalarMulG2WithBases(bases_in: [4]G2Point, scalar: Fr) G2Projective {
     return shamirMul4D(bases, decomp.k, decomp.max_bits);
 }
 
-/// Shamir's trick for 4-point multi-scalar multiplication
+/// Shamir's trick: computes sum(coeffs[i] * bases[i]) over the low max_bits bits using a 16-entry table of base subset sums.
 fn shamirMul4D(bases: [4]G2Point, coeffs: [4][4]u64, max_bits: usize) G2Projective {
-    var result = G2Projective.identity();
+    // Precompute table[0..16]: table[b3<<3|b2<<2|b1<<1|b0] is the sum of every bases[i] whose bit b_i is set (stored in affine form).
+    var table: [16]G2Point = undefined;
+    table[0] = G2Point.identity();
+    table[1] = bases[0];
+    table[2] = bases[1];
+    table[4] = bases[2];
+    table[8] = bases[3];
+    table[3] = G2Projective.fromAffine(bases[0]).addAffine(bases[1]).toAffine();
+    table[5] = G2Projective.fromAffine(bases[0]).addAffine(bases[2]).toAffine();
+    table[6] = G2Projective.fromAffine(bases[1]).addAffine(bases[2]).toAffine();
+    table[9] = G2Projective.fromAffine(bases[0]).addAffine(bases[3]).toAffine();
+    table[10] = G2Projective.fromAffine(bases[1]).addAffine(bases[3]).toAffine();
+    table[12] = G2Projective.fromAffine(bases[2]).addAffine(bases[3]).toAffine();
+    table[7] = G2Projective.fromAffine(table[3]).addAffine(bases[2]).toAffine();
+    table[11] = G2Projective.fromAffine(table[3]).addAffine(bases[3]).toAffine();
+    table[13] = G2Projective.fromAffine(table[5]).addAffine(bases[3]).toAffine();
+    table[14] = G2Projective.fromAffine(table[6]).addAffine(bases[3]).toAffine();
+    table[15] = G2Projective.fromAffine(table[7]).addAffine(bases[3]).toAffine();
 
+    var result = G2Projective.identity();
     var bit_idx: usize = max_bits;
-    while (bit_idx > 0) {
+
+    // If max_bits is odd, consume the top bit on its own so an even number of bits remains for the 2-bit loop.
+    if (max_bits % 2 == 1) {
         bit_idx -= 1;
+        const idx = shamirTableIndex(coeffs, bit_idx);
+        if (idx != 0) {
+            result = G2Projective.fromAffine(table[idx]);
+        }
+    }
 
+    // Consume the remaining bits two per iteration, most significant first: double, add, double, add.
+    while (bit_idx >= 2) {
+        bit_idx -= 2;
+
+        // Double before adding the higher of the two bits (skipped while result is still the identity).
         if (!result.isIdentity()) {
             result = result.double();
         }
 
-        const limb_pos = bit_idx / 64;
-        const bit_pos: u6 = @intCast(bit_idx % 64);
+        // Add the table entry selected by the higher bit (bit_idx + 1) of all four scalars.
+        const idx_hi = shamirTableIndex(coeffs, bit_idx + 1);
+        if (idx_hi != 0) {
+            if (result.isIdentity()) {
+                result = G2Projective.fromAffine(table[idx_hi]);
+            } else {
+                result = result.addAffine(table[idx_hi]);
+            }
+        }
 
-        for (0..4) |i| {
-            if (limb_pos < 4 and (coeffs[i][limb_pos] >> bit_pos) & 1 == 1) {
-                if (result.isIdentity()) {
-                    result = G2Projective.fromAffine(bases[i]);
-                } else {
-                    result = result.addAffine(bases[i]);
-                }
+        // Double again before adding the lower of the two bits.
+        if (!result.isIdentity()) {
+            result = result.double();
+        }
+
+        // Add the table entry selected by the lower bit (bit_idx) of all four scalars.
+        const idx_lo = shamirTableIndex(coeffs, bit_idx);
+        if (idx_lo != 0) {
+            if (result.isIdentity()) {
+                result = G2Projective.fromAffine(table[idx_lo]);
+            } else {
+                result = result.addAffine(table[idx_lo]);
             }
         }
     }
 
     return result;
 }
 
+/// Build the 4-bit table index whose bit i is bit `bit_idx` of coeffs[i] (limb 0 least significant); returns 0 when bit_idx >= 256.
+inline fn shamirTableIndex(coeffs: [4][4]u64, bit_idx: usize) u4 {
+    const limb_pos = bit_idx / 64;
+    const bit_pos: u6 = @intCast(bit_idx % 64);
+    var idx: u4 = 0;
+    if (limb_pos < 4) {
+        for (0..4) |i| {
+            if ((coeffs[i][limb_pos] >> bit_pos) & 1 == 1) {
+                idx |= @as(u4, 1) << @intCast(i);
+            }
+        }
+    }
+    return idx;
+}
+
 // ============================================================================
 // Utility functions
 // ============================================================================