Skip to content

Commit 2bc7f5c

Browse files
authored
Merge pull request #87 from MatteoMer/perf/dory-pairing-microopts
perf: Dory/pairing micro-optimizations
2 parents 8d5bf91 + 74e6616 commit 2bc7f5c

2 files changed

Lines changed: 92 additions & 32 deletions

File tree

packages/zolt-arith/src/field/pairing.zig

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ const MAX_UNPREPARED_BATCH: usize = 8;
584584

585585
/// Maximum pairs per sub-batch for prepared Miller loop.
586586
/// Only EllCoeff is read per step (~48 bytes/pair), so larger batches still fit in L1 cache.
587-
const MAX_PREPARED_BATCH: usize = 16;
587+
const MAX_PREPARED_BATCH: usize = 64;
588588

589589
/// Batched Miller loop using precomputed G2 coefficients.
590590
/// Shares a single Fp12.square() per ATE iteration across all pairs,
@@ -707,31 +707,34 @@ pub fn batchedMillerLoopUnprepared(
707707
f = f.square();
708708
}
709709

710-
// Doubling step for all pairs
711-
for (0..n) |k| {
712-
if (g1_points[k].infinity or g2_points[k].infinity) continue;
713-
const coeffs_dbl = rs[k].double_in_place(two_inv);
714-
const c0_eval = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
715-
const c1_eval = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
716-
f = fp12MulBy034(f, c0_eval, c1_eval, coeffs_dbl.c2);
717-
}
718-
719710
const bit = ATE_LOOP_COUNT[idx - 1];
720-
if (bit == 1) {
711+
712+
if (bit == 1 or bit == -1) {
713+
// Non-zero bit: combine the doubling and addition lines with a sparse × sparse multiplication
721714
for (0..n) |k| {
722715
if (g1_points[k].infinity or g2_points[k].infinity) continue;
723-
const coeffs_add = rs[k].add_in_place(g2_points[k]);
724-
const c0_add = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
725-
const c1_add = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
726-
f = fp12MulBy034(f, c0_add, c1_add, coeffs_add.c2);
716+
// Doubling line coefficients
717+
const coeffs_dbl = rs[k].double_in_place(two_inv);
718+
const dbl_c0 = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
719+
const dbl_c1 = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
720+
// Addition line coefficients
721+
const q = if (bit == 1) g2_points[k] else nqs[k];
722+
const coeffs_add = rs[k].add_in_place(q);
723+
const add_c0 = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
724+
const add_c1 = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
725+
// Sparse × sparse combination (6 Fp2.mul)
726+
const combined = fp12Mul034By034(dbl_c0, dbl_c1, coeffs_dbl.c2, add_c0, add_c1, coeffs_add.c2);
727+
// 01234-sparse × full (17 Fp2.mul)
728+
f = fp12MulBy01234(f, combined);
727729
}
728-
} else if (bit == -1) {
730+
} else {
731+
// Zero bit: only doubling line
729732
for (0..n) |k| {
730733
if (g1_points[k].infinity or g2_points[k].infinity) continue;
731-
const coeffs_add = rs[k].add_in_place(nqs[k]);
732-
const c0_add = fp2ScalarMul(coeffs_add.c0, g1_points[k].y);
733-
const c1_add = fp2ScalarMul(coeffs_add.c1, g1_points[k].x);
734-
f = fp12MulBy034(f, c0_add, c1_add, coeffs_add.c2);
734+
const coeffs_dbl = rs[k].double_in_place(two_inv);
735+
const c0_eval = fp2ScalarMul(coeffs_dbl.c0, g1_points[k].y);
736+
const c1_eval = fp2ScalarMul(coeffs_dbl.c1, g1_points[k].x);
737+
f = fp12MulBy034(f, c0_eval, c1_eval, coeffs_dbl.c2);
735738
}
736739
}
737740
}

packages/zolt-arith/src/msm/glv.zig

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -445,35 +445,92 @@ pub fn glvScalarMulG2WithBases(bases_in: [4]G2Point, scalar: Fr) G2Projective {
445445
return shamirMul4D(bases, decomp.k, decomp.max_bits);
446446
}
447447

448-
/// Shamir's trick for 4-point multi-scalar multiplication
448+
/// Shamir's trick for 4-point multi-scalar multiplication with precomputed table.
449449
fn shamirMul4D(bases: [4]G2Point, coeffs: [4][4]u64, max_bits: usize) G2Projective {
450-
var result = G2Projective.identity();
450+
// Precompute table[0..16]: table[b3<<3 | b2<<2 | b1<<1 | b0] is the sum of bases[i] over every i whose bit b_i is set (table[0] is the identity)
451+
var table: [16]G2Point = undefined;
452+
table[0] = G2Point.identity();
453+
table[1] = bases[0];
454+
table[2] = bases[1];
455+
table[4] = bases[2];
456+
table[8] = bases[3];
457+
table[3] = G2Projective.fromAffine(bases[0]).addAffine(bases[1]).toAffine();
458+
table[5] = G2Projective.fromAffine(bases[0]).addAffine(bases[2]).toAffine();
459+
table[6] = G2Projective.fromAffine(bases[1]).addAffine(bases[2]).toAffine();
460+
table[9] = G2Projective.fromAffine(bases[0]).addAffine(bases[3]).toAffine();
461+
table[10] = G2Projective.fromAffine(bases[1]).addAffine(bases[3]).toAffine();
462+
table[12] = G2Projective.fromAffine(bases[2]).addAffine(bases[3]).toAffine();
463+
table[7] = G2Projective.fromAffine(table[3]).addAffine(bases[2]).toAffine();
464+
table[11] = G2Projective.fromAffine(table[3]).addAffine(bases[3]).toAffine();
465+
table[13] = G2Projective.fromAffine(table[5]).addAffine(bases[3]).toAffine();
466+
table[14] = G2Projective.fromAffine(table[6]).addAffine(bases[3]).toAffine();
467+
table[15] = G2Projective.fromAffine(table[7]).addAffine(bases[3]).toAffine();
451468

469+
var result = G2Projective.identity();
452470
var bit_idx: usize = max_bits;
453-
while (bit_idx > 0) {
471+
472+
// If max_bits is odd, consume the top bit on its own so the 2-bit loop below sees an even number of remaining bits
473+
if (max_bits % 2 == 1) {
454474
bit_idx -= 1;
475+
const idx = shamirTableIndex(coeffs, bit_idx);
476+
if (idx != 0) {
477+
result = G2Projective.fromAffine(table[idx]);
478+
}
479+
}
455480

481+
// Process 2 bits at a time: double-add-double-add
482+
while (bit_idx >= 2) {
483+
bit_idx -= 2;
484+
485+
// Double for high bit position
456486
if (!result.isIdentity()) {
457487
result = result.double();
458488
}
459489

460-
const limb_pos = bit_idx / 64;
461-
const bit_pos: u6 = @intCast(bit_idx % 64);
490+
// High bit contribution
491+
const idx_hi = shamirTableIndex(coeffs, bit_idx + 1);
492+
if (idx_hi != 0) {
493+
if (result.isIdentity()) {
494+
result = G2Projective.fromAffine(table[idx_hi]);
495+
} else {
496+
result = result.addAffine(table[idx_hi]);
497+
}
498+
}
462499

463-
for (0..4) |i| {
464-
if (limb_pos < 4 and (coeffs[i][limb_pos] >> bit_pos) & 1 == 1) {
465-
if (result.isIdentity()) {
466-
result = G2Projective.fromAffine(bases[i]);
467-
} else {
468-
result = result.addAffine(bases[i]);
469-
}
500+
// Double for low bit position
501+
if (!result.isIdentity()) {
502+
result = result.double();
503+
}
504+
505+
// Low bit contribution
506+
const idx_lo = shamirTableIndex(coeffs, bit_idx);
507+
if (idx_lo != 0) {
508+
if (result.isIdentity()) {
509+
result = G2Projective.fromAffine(table[idx_lo]);
510+
} else {
511+
result = result.addAffine(table[idx_lo]);
470512
}
471513
}
472514
}
473515

474516
return result;
475517
}
476518

519+
/// Compute 4-bit Shamir table index from bit position across 4 scalars.
520+
inline fn shamirTableIndex(coeffs: [4][4]u64, bit_idx: usize) u4 {
521+
const limb_pos = bit_idx / 64;
522+
const bit_pos: u6 = @intCast(bit_idx % 64);
523+
var idx: u4 = 0;
524+
if (limb_pos < 4) {
525+
for (0..4) |i| {
526+
if ((coeffs[i][limb_pos] >> bit_pos) & 1 == 1) {
527+
idx |= @as(u4, 1) << @intCast(i);
528+
}
529+
}
530+
}
531+
return idx;
532+
}
533+
477534
// ============================================================================
478535
// Utility functions
479536
// ============================================================================

0 commit comments

Comments
 (0)