
Commit 594b265

Author: Jamil Nimeh
8350126: Regression ~3% on Crypto-ChaCha20Poly1305.encrypt for MacOSX aarch64
Reviewed-by: aph
1 parent d783a94 commit 594b265

3 files changed: 315 additions, 258 deletions


src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp

Lines changed: 10 additions & 6 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
@@ -1611,11 +1611,15 @@ class MacroAssembler: public Assembler {
   void aes_round(FloatRegister input, FloatRegister subkey);
 
   // ChaCha20 functions support block
-  void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
-          FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
-          FloatRegister tbl);
-  void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
-          FloatRegister dVec, bool colToDiag);
+  void cc20_qr_add4(FloatRegister (&addFirst)[4],
+          FloatRegister (&addSecond)[4]);
+  void cc20_qr_xor4(FloatRegister (&firstElem)[4],
+          FloatRegister (&secondElem)[4], FloatRegister (&result)[4]);
+  void cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
+          FloatRegister (&destReg)[4], int bits, FloatRegister table);
+  void cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
+          const FloatRegister (&stateVectors)[16], int idx1, int idx2,
+          int idx3, int idx4);
 
   // Place an ISB after code may have been modified due to a safepoint.
   void safepoint_isb();
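
A note on the new signatures: taking the register sets as references-to-arrays, e.g. FloatRegister (&addFirst)[4], rather than as plain pointers means the compiler rejects any call site that passes a set of the wrong size. A minimal standalone illustration of the idiom (a sketch, not JDK code):

// Reference-to-array parameters bind only to arrays of exactly the
// declared length, so the element count is checked at compile time.
static int sum4(const int (&vals)[4]) {
  return vals[0] + vals[1] + vals[2] + vals[3];
}

int main() {
  int quad[4] = {1, 2, 3, 4};
  return sum4(quad);       // OK: array of exactly 4 elements
  // int tri[3] = {1, 2, 3};
  // return sum4(tri);     // would not compile: size mismatch
}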

src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp

Lines changed: 104 additions & 45 deletions
@@ -28,60 +28,119 @@
 #include "runtime/stubRoutines.hpp"
 
 /**
- * Perform the quarter round calculations on values contained within
- * four SIMD registers.
+ * Perform the vectorized add for a group of 4 quarter round operations.
+ * In the ChaCha20 quarter round, there are two add ops: a += b and c += d.
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for each addend in the add operation for each of the quarter rounds
+ * (e.g. for "a" it would consist of v0/v1/v2/v3). The result of the add
+ * is placed into the vectors in the "addFirst" array.
  *
- * @param aVec the SIMD register containing only the "a" values
- * @param bVec the SIMD register containing only the "b" values
- * @param cVec the SIMD register containing only the "c" values
- * @param dVec the SIMD register containing only the "d" values
- * @param scratch scratch SIMD register used for 12 and 7 bit left rotations
- * @param table the SIMD register used as a table for 8 bit left rotations
+ * @param addFirst array of SIMD registers representing the first addend.
+ * @param addSecond array of SIMD registers representing the second addend.
  */
-void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
-        FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
-        FloatRegister table) {
+void MacroAssembler::cc20_qr_add4(FloatRegister (&addFirst)[4],
+        FloatRegister (&addSecond)[4]) {
+  for (int i = 0; i < 4; i++) {
+    addv(addFirst[i], T4S, addFirst[i], addSecond[i]);
+  }
+}
+
+
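Where the old cc20_quarter_round interleaved the add/xor/rotate steps of one quarter-round group, the split helpers batch a single step across all four quarter rounds, so the four ADDV instructions emitted here are independent of one another. A scalar model of what the emitted code computes (an illustrative sketch, not JDK code; it assumes each SIMD register holds four 32-bit lanes, the 4S arrangement):

#include <cstdint>

// Model of one SIMD register in the 4S arrangement: four 32-bit lanes.
struct Vec4S { uint32_t lane[4]; };

// cc20_qr_add4 semantics: addFirst[i] += addSecond[i] for each of the
// four quarter rounds; every lane-wise add wraps modulo 2^32.
static void qr_add4(Vec4S (&addFirst)[4], const Vec4S (&addSecond)[4]) {
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      addFirst[i].lane[j] += addSecond[i].lane[j];
    }
  }
}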
+/**
+ * Perform the vectorized XOR for a group of 4 quarter round operations.
+ * In the ChaCha20 quarter round, there are two XOR ops: d ^= a and b ^= c.
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for each element in the xor operation for each of the quarter rounds
+ * (e.g. for "a" it would consist of v0/v1/v2/v3).
+ * Note: because the b ^= c ops precede a non-byte-aligned left-rotation,
+ * there is a third parameter which can take a set of scratch registers
+ * for the result, which facilitates doing the subsequent operations for
+ * the left rotation.
+ *
+ * @param firstElem array of SIMD registers representing the first element.
+ * @param secondElem array of SIMD registers representing the second element.
+ * @param result array of SIMD registers representing the destination.
+ *        May be the same as firstElem or secondElem, or a separate array.
+ */
+void MacroAssembler::cc20_qr_xor4(FloatRegister (&firstElem)[4],
+        FloatRegister (&secondElem)[4], FloatRegister (&result)[4]) {
+  for (int i = 0; i < 4; i++) {
+    eor(result[i], T16B, firstElem[i], secondElem[i]);
+  }
+}
+
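The reason result may differ from firstElem becomes clear in cc20_qr_lrot4 below: the 12- and 7-bit rotations read the source registers twice (once for ushr, once for sli), so the XOR result must land somewhere other than the rotation's destination. A hypothetical caller-side pattern (the set and table names here are assumed for illustration; the real caller lives outside this excerpt, likely in the stub generator):

// scratch = b ^ c, then rotate scratch back into the "b" registers
cc20_qr_xor4(bSet, cSet, scratchSet);
cc20_qr_lrot4(scratchSet, bSet, 12, lrot8Tbl);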
+/**
+ * Perform the vectorized left-rotation on 32-bit lanes for a group of
+ * 4 quarter round operations.
+ * Each parameter is a set of 4 registers representing the 4 registers
+ * for each element in the source and destination for each of the quarter
+ * rounds (e.g. for "d" it would consist of v12/v13/v14/v15 on columns and
+ * v15/v12/v13/v14 on diagonal alignments).
+ *
+ * @param sourceReg array of SIMD registers representing the source
+ * @param destReg array of SIMD registers representing the destination
+ * @param bits the distance of the rotation in bits, must be 16/12/8/7 per
+ *        the ChaCha20 specification.
+ * @param table the SIMD register used as a table for 8 bit left rotations
+ */
+void MacroAssembler::cc20_qr_lrot4(FloatRegister (&sourceReg)[4],
+        FloatRegister (&destReg)[4], int bits, FloatRegister table) {
+  switch (bits) {
+    case 16:  // reg <<<= 16, in-place swap of half-words
+      for (int i = 0; i < 4; i++) {
+        rev32(destReg[i], T8H, sourceReg[i]);
+      }
+      break;
 
-  // a += b, d ^= a, d <<<= 16
-  addv(aVec, T4S, aVec, bVec);
-  eor(dVec, T16B, dVec, aVec);
-  rev32(dVec, T8H, dVec);
+    case 7:   // reg <<<= (12 || 7)
+    case 12:  // r-shift src -> dest, l-shift src & ins to dest
+      for (int i = 0; i < 4; i++) {
+        ushr(destReg[i], T4S, sourceReg[i], 32 - bits);
+      }
 
-  // c += d, b ^= c, b <<<= 12
-  addv(cVec, T4S, cVec, dVec);
-  eor(scratch, T16B, bVec, cVec);
-  ushr(bVec, T4S, scratch, 20);
-  sli(bVec, T4S, scratch, 12);
+      for (int i = 0; i < 4; i++) {
+        sli(destReg[i], T4S, sourceReg[i], bits);
+      }
+      break;
 
-  // a += b, d ^= a, d <<<= 8
-  addv(aVec, T4S, aVec, bVec);
-  eor(dVec, T16B, dVec, aVec);
-  tbl(dVec, T16B, dVec, 1, table);
+    case 8:   // reg <<<= 8, simulate left rotation with table reorg
+      for (int i = 0; i < 4; i++) {
+        tbl(destReg[i], T16B, sourceReg[i], 1, table);
+      }
+      break;
 
-  // c += d, b ^= c, b <<<= 7
-  addv(cVec, T4S, cVec, dVec);
-  eor(scratch, T16B, bVec, cVec);
-  ushr(bVec, T4S, scratch, 25);
-  sli(bVec, T4S, scratch, 7);
+    default:
+      // The caller shouldn't be sending bit rotation values outside
+      // of 16/12/8/7 as defined in the specification.
+      ShouldNotReachHere();
+  }
 }
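
Of the four rotation distances, 16 is a half-word swap within each 32-bit lane (REV32 on 8H) and 8 is a byte shuffle (TBL), each a single instruction per register; only 12 and 7 need the two-instruction ushr/sli pair. A scalar model of that pair (illustrative only, not JDK code):

#include <cstdint>

// USHR writes (x >> (32 - bits)) into the destination; SLI then shifts
// x left by `bits` and inserts it, leaving the destination's low `bits`
// bits untouched. Together they form a left rotation by `bits`.
static uint32_t lrot_ushr_sli(uint32_t x, int bits) {
  uint32_t dest = x >> (32 - bits);  // USHR dest, src, #(32 - bits)
  dest |= x << bits;                 // SLI  dest, src, #bits
  return dest;                       // == rotate-left(x, bits)
}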

 /**
- * Shift the b, c, and d vectors between columnar and diagonal representations.
- * Note that the "a" vector does not shift.
+ * Set the FloatRegisters for a 4-vector register set. These will be used
+ * during various quarter round transformations (adds, xors and left-rotations).
+ * This method itself does not result in the output of any assembly
+ * instructions. It just organizes the vectors so they can be in columnar or
+ * diagonal alignments.
  *
- * @param bVec the SIMD register containing only the "b" values
- * @param cVec the SIMD register containing only the "c" values
- * @param dVec the SIMD register containing only the "d" values
- * @param colToDiag true if moving columnar to diagonal, false if
- *        moving diagonal back to columnar.
+ * @param vectorSet a 4-vector array to be altered into a new alignment
+ * @param stateVectors the 16-vector array that represents the current
+ *        working state. The indices of this array match up with the
+ *        organization of the ChaCha20 state per RFC 7539 (e.g. stateVectors[12]
+ *        would contain the vector that holds the 32-bit counter, etc.)
+ * @param idx1 the index of the stateVectors array to be assigned to the
+ *        first vectorSet element.
+ * @param idx2 the index of the stateVectors array to be assigned to the
+ *        second vectorSet element.
+ * @param idx3 the index of the stateVectors array to be assigned to the
+ *        third vectorSet element.
+ * @param idx4 the index of the stateVectors array to be assigned to the
+ *        fourth vectorSet element.
  */
-void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
-        FloatRegister dVec, bool colToDiag) {
-  int bShift = colToDiag ? 4 : 12;
-  int cShift = 8;
-  int dShift = colToDiag ? 12 : 4;
-
-  ext(bVec, T16B, bVec, bVec, bShift);
-  ext(cVec, T16B, cVec, cVec, cShift);
-  ext(dVec, T16B, dVec, dVec, dShift);
+void MacroAssembler::cc20_set_qr_registers(FloatRegister (&vectorSet)[4],
+        const FloatRegister (&stateVectors)[16], int idx1, int idx2,
+        int idx3, int idx4) {
+  vectorSet[0] = stateVectors[idx1];
+  vectorSet[1] = stateVectors[idx2];
+  vectorSet[2] = stateVectors[idx3];
+  vectorSet[3] = stateVectors[idx4];
 }
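
Putting the helpers together: per RFC 7539 the columnar quarter rounds are QR(0,4,8,12) through QR(3,7,11,15) and the diagonal rounds are QR(0,5,10,15), QR(1,6,11,12), QR(2,7,8,13) and QR(3,4,9,14), which is where index orderings like v15/v12/v13/v14 for "d" come from. A sketch of how one columnar round could be driven with these helpers (a hypothetical composition; the register-set and table names are assumed, and the actual generator code is not part of this excerpt):

// Columnar alignment: "a" = state words 0-3, "b" = 4-7, and so on.
cc20_set_qr_registers(aSet, workSt,  0,  1,  2,  3);
cc20_set_qr_registers(bSet, workSt,  4,  5,  6,  7);
cc20_set_qr_registers(cSet, workSt,  8,  9, 10, 11);
cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

cc20_qr_add4(aSet, bSet);                    // a += b
cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16 (in place)
cc20_qr_add4(cSet, dSet);                    // c += d
cc20_qr_xor4(bSet, cSet, scratch);           // scratch = b ^ c
cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
// ... the 8- and 7-bit steps follow the same add/xor/rotate shape,
// after which the b/c/d sets are reassigned to diagonal indices, e.g.
cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);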
