@@ -779,6 +779,19 @@ void MacroAssembler::avx_ghash(Register input_state, Register htbl,
   vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
 }
 
+// Add 128-bit integers in xmmsrc1 to xmmsrc2, then place the result in xmmdst.
+// Clobber ktmp and rscratch.
+// Used by aesctr_encrypt.
+void MacroAssembler::ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
+                               int vector_len, KRegister ktmp, Register rscratch) {
+  vpaddq(xmmdst, xmmsrc1, xmmsrc2, vector_len);
+  evpcmpuq(ktmp, xmmdst, xmmsrc2, lt, vector_len); // set mask[0/1] bit if addq to dst[0/1] wraps
+  kshiftlbl(ktmp, ktmp, 1);                        // mask[1] <- mask[0], mask[0] <- 0, etc.
+  evpaddq(xmmdst, ktmp, xmmdst, xmm17, /*merge*/ true,
+          vector_len);                             // dst[1]++ if mask[1] set
+}
+
 // AES Counter Mode using VAES instructions
 void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
                                     Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
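For readers following the carry logic, here is a minimal scalar C++ model of the instruction sequence in the ev_add128 helper added above. The `U128` struct and `ev_add128_model` name are hypothetical, used only for illustration; at `AVX_512bit` a zmm register holds four such 128-bit lanes, with the k-mask tracking per-qword carries across the whole register.

```cpp
#include <cstdint>

// Hypothetical scalar model of one 128-bit lane of ev_add128.
struct U128 { uint64_t q[2]; };          // q[0] = low qword, q[1] = high qword

U128 ev_add128_model(U128 src1, U128 src2) {
  U128 dst;
  unsigned mask = 0;
  // vpaddq: independent 64-bit adds, no carry between qword lanes
  dst.q[0] = src1.q[0] + src2.q[0];
  dst.q[1] = src1.q[1] + src2.q[1];
  // evpcmpuq(..., lt, ...): bit i set iff the add into qword i wrapped
  if (dst.q[0] < src2.q[0]) mask |= 1;
  if (dst.q[1] < src2.q[1]) mask |= 2;
  // kshiftlbl: each carry bit moves up to the next qword's position
  mask <<= 1;
  // masked evpaddq with the all-ones vector (xmm17): propagate the carry
  if (mask & 2) dst.q[1] += 1;           // dst[1]++ if the low qword wrapped
  return dst;
}
```

In the stub, the increment vectors (linc0/linc4/linc32 below) hold small values whose high qwords are zero, so only low-qword adds can wrap and the shifted mask never bleeds into a neighboring 128-bit counter.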
@@ -831,19 +844,23 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   // shuffle counter using lbswap_mask
   vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
 
+  // Vector value to propagate carries
+  evmovdquq(xmm17, ExternalAddress(StubRoutines::x86::counter_mask_ones_addr()), Assembler::AVX_512bit, r15);
   // pre-increment and propagate counter values to zmm9-zmm15 registers.
   // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4
   // The counter is incremented after each block i.e. 16 bytes is processed;
   // each zmm register has 4 counter values as its MSB
   // the counters are incremented in parallel
-  vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);   // linc0
-  vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);  // linc4(rip)
-  vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
-  vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
-  vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
-  vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
-  vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
-  vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
+  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15 /*rscratch*/);  // linc0
+  ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15 /*rscratch*/); // linc4
+  ev_add128(xmm9, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm10, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm11, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm12, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm13, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm14, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
+  ev_add128(xmm15, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15);
 
   // load linc32 mask in zmm register. linc32 increments counter by 32
   evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15); // Linc32
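As a rough sketch of the lattice this sets up (hypothetical code, showing only the low qword of each 128-bit counter; ev_add128 handles the carry into the high qword): zmm8 gets base + {0,1,2,3}, each following register adds 4 to the previous one, and linc32 later advances everything by 32 per main-loop iteration.

```cpp
#include <cstdint>

// Hypothetical model of the counter lattice: zmm8..zmm15 hold 4 counters
// each, so one main-loop iteration covers 32 blocks.
int main() {
  uint64_t base = 0;                       // low qword of the shuffled counter
  uint64_t lanes[8][4];                    // [register][lane], zmm8..zmm15
  for (int lane = 0; lane < 4; lane++)
    lanes[0][lane] = base + lane;          // linc0: zmm8 = base + {0,1,2,3}
  for (int r = 1; r < 8; r++)              // linc4: each register is the
    for (int lane = 0; lane < 4; lane++)   // previous one plus 4
      lanes[r][lane] = lanes[r - 1][lane] + 4;
  // lanes[r][lane] == base + 4*r + lane: counters for blocks 0..31.
  // Per iteration, linc32 then advances every lane by 32 (via ev_add128).
  return 0;
}
```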
@@ -891,21 +908,21 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   // This is followed by incrementing counter values in zmm8-zmm15.
   // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
   roundEnc(xmm21, 7);
-  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm22, 7);
-  vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm23, 7);
-  vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm10, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm24, 7);
-  vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm11, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm25, 7);
-  vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm12, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm26, 7);
-  vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm13, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm27, 7);
-  vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm14, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm28, 7);
-  vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm15, xmm15, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   roundEnc(xmm29, 7);
 
   cmpl(rounds, 52);
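To motivate the substitutions in this hunk: below is a minimal, hypothetical demonstration of what the old vpaddq-only increment did once the low qword of a counter wrapped, which is exactly the case ev_add128 exists to handle. The values are illustrative, not taken from the patch.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical demonstration of the dropped carry in the old code.
// Take a 128-bit counter whose low qword is 2 below the wrap point and
// advance it by 32, as the main loop does per iteration.
int main() {
  uint64_t lo = UINT64_MAX - 1, hi = 7;

  // Old behavior (vpaddq): 64-bit lane add only, carry discarded.
  uint64_t old_lo = lo + 32, old_hi = hi;        // -> hi=7, lo=30

  // New behavior (ev_add128): the unsigned compare detects the wrap
  // and bumps the high qword.
  uint64_t new_lo = lo + 32;
  uint64_t new_hi = hi + (new_lo < 32 ? 1 : 0);  // -> hi=8, lo=30

  assert(old_hi == 7 && old_lo == 30);  // same value as a counter from
                                        // earlier in this hi epoch: CTR
                                        // keystream reuse
  assert(new_hi == 8 && new_lo == 30);  // correctly enters the next epoch
  return 0;
}
```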
@@ -983,8 +1000,8 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
   evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
   // Increment counter values by 16
-  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
-  vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
+  ev_add128/*!!!*/(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   // AES encode rounds
   roundEnc(xmm21, 3);
   roundEnc(xmm22, 3);
@@ -1051,7 +1068,7 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
   evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
   // increment counter by 8
-  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   // AES encode
   roundEnc(xmm21, 1);
   roundEnc(xmm22, 1);
@@ -1109,7 +1126,7 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
   evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
   // Increment counter
-  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
+  ev_add128/*!!!*/(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/ k1, r15 /*rscratch*/);
   vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
   vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
   vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
@@ -1159,7 +1176,7 @@ void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Regis
   evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
   vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
   // Increment counter by 1
-  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
+  ev_add128/*!!!*/(xmm8, xmm8, xmm19, Assembler::AVX_128bit, /*ktmp*/ k1, r15 /*rscratch*/);
   vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
   vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
   vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);