|
28 | 28 | #include "runtime/stubRoutines.hpp"
|
29 | 29 |
|
30 | 30 | /**
|
31 |
| - * Perform the quarter round calculations on values contained within |
32 |
| - * four SIMD registers. |
| 31 | + * Perform the vectorized add for a group of 4 quarter round operations. |
| 32 | + * In the ChaCha20 quarter round, there are two add ops: a += b and c += d. |
| 33 | + * Each parameter is a set of 4 registers, one for each of the 4 quarter |
| 34 | + * rounds, holding the corresponding addend of the add operation |
| 35 | + * (e.g. for "a" it would consist of v0/v1/v2/v3). The result of the add |
| 36 | + * is placed into the vectors in the "addFirst" array. |
33 | 37 | *
|
34 |
| - * @param aVec the SIMD register containing only the "a" values |
35 |
| - * @param bVec the SIMD register containing only the "b" values |
36 |
| - * @param cVec the SIMD register containing only the "c" values |
37 |
| - * @param dVec the SIMD register containing only the "d" values |
38 |
| - * @param scratch scratch SIMD register used for 12 and 7 bit left rotations |
39 |
| - * @param table the SIMD register used as a table for 8 bit left rotations |
| 38 | + * @param addFirst array of SIMD registers for the first addend; receives the sum. |
| 39 | + * @param addSecond array of SIMD registers for the second addend. |
40 | 40 | */
|
41 |
| -void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec, |
42 |
| - FloatRegister cVec, FloatRegister dVec, FloatRegister scratch, |
43 |
| - FloatRegister table) { |
| 41 | +void MacroAssembler::cc20_qr_add4(FloatRegister (&addFirst)[4], |
| 42 | + FloatRegister (&addSecond)[4]) { |
| 43 | + for (int i = 0; i < 4; i++) { |
| 44 | + addv(addFirst[i], T4S, addFirst[i], addSecond[i]); |
| 45 | + } |
| 46 | +} |
| 47 | + |
| 48 | + |
| 49 | +/** |
| 50 | + * Perform the vectorized XOR for a group of 4 quarter round operations. |
| 51 | + * In the ChaCha20 quarter round, there are two XOR ops: d ^= a and b ^= c. |
| 52 | + * Each parameter is a set of 4 registers, one for each of the 4 quarter |
| 53 | + * rounds, holding the corresponding element of the xor operation |
| 54 | + * (e.g. for "a" it would consist of v0/v1/v2/v3). |
| 55 | + * Note: because the b ^= c ops precede a non-byte-aligned left-rotation, |
| 56 | + * there is a third parameter which can take a set of scratch registers |
| 57 | + * for the result, which facilitates doing the subsequent operations for |
| 58 | + * the left rotation. |
| 59 | + * |
| 60 | + * @param firstElem array of SIMD registers representing the first element. |
| 61 | + * @param secondElem array of SIMD registers representing the second element. |
| 62 | + * @param result array of SIMD registers that receives firstElem ^ secondElem. |
| 63 | + * May be the same as firstElem or secondElem, or a separate array. |
| 64 | + */ |
| 65 | +void MacroAssembler::cc20_qr_xor4(FloatRegister (&firstElem)[4], |
| 66 | + FloatRegister (&secondElem)[4], FloatRegister (&result)[4]) { |
| 67 | + for (int i = 0; i < 4; i++) { |
| 68 | + eor(result[i], T16B, firstElem[i], secondElem[i]); |
| 69 | + } |
| 70 | +} |
| 71 | + |
| 72 | +/** |
| 73 | + * Perform the vectorized left-rotation on 32-bit lanes for a group of |
| 74 | + * 4 quarter round operations. |
| 75 | + * The source and destination parameters are each a set of 4 registers, |
| 76 | + * one for each of the 4 quarter rounds, for the element being rotated |
| 77 | + * (e.g. for "d" it would consist of v12/v13/v14/v15 on columns and |
| 78 | + * v15/v12/v13/v14 on diagonal alignments). |
| 79 | + * |
| 80 | + * @param sourceReg array of SIMD registers representing the source |
| 81 | + * @param destReg array of SIMD registers representing the destination |
| 82 | + * @param bits the rotation distance in bits: 16/12/8/7 per the ChaCha20 spec. |
| 83 | + * @param table the SIMD register used as a table for 8-bit left rotations. |
| 84 | + */ |
| 85 | +void MacroAssembler::cc20_qr_lrot4(FloatRegister (&sourceReg)[4], |
| 86 | + FloatRegister (&destReg)[4], int bits, FloatRegister table) { |
| 87 | + switch (bits) { |
| 88 | + case 16: // reg <<<= 16, swap of the half-words (src -> dest) |
| 89 | + for (int i = 0; i < 4; i++) { |
| 90 | + rev32(destReg[i], T8H, sourceReg[i]); |
| 91 | + } |
| 92 | + break; |
44 | 93 |
|
45 |
| - // a += b, d ^= a, d <<<= 16 |
46 |
| - addv(aVec, T4S, aVec, bVec); |
47 |
| - eor(dVec, T16B, dVec, aVec); |
48 |
| - rev32(dVec, T8H, dVec); |
| 94 | + case 7: // reg <<<= 7 or reg <<<= 12 (shared code path) |
| 95 | + case 12: // r-shift src -> dest, then l-shift src & ins to dest; src is re-read after dest is written, so dest must not alias src |
| 96 | + for (int i = 0; i < 4; i++) { |
| 97 | + ushr(destReg[i], T4S, sourceReg[i], 32 - bits); |
| 98 | + } |
49 | 99 |
|
50 |
| - // c += d, b ^= c, b <<<= 12 |
51 |
| - addv(cVec, T4S, cVec, dVec); |
52 |
| - eor(scratch, T16B, bVec, cVec); |
53 |
| - ushr(bVec, T4S, scratch, 20); |
54 |
| - sli(bVec, T4S, scratch, 12); |
| 100 | + for (int i = 0; i < 4; i++) { |
| 101 | + sli(destReg[i], T4S, sourceReg[i], bits); |
| 102 | + } |
| 103 | + break; |
55 | 104 |
|
56 |
| - // a += b, d ^= a, d <<<= 8 |
57 |
| - addv(aVec, T4S, aVec, bVec); |
58 |
| - eor(dVec, T16B, dVec, aVec); |
59 |
| - tbl(dVec, T16B, dVec, 1, table); |
| 105 | + case 8: // reg <<<= 8, simulate left rotation with a table lookup (tbl) |
| 106 | + for (int i = 0; i < 4; i++) { |
| 107 | + tbl(destReg[i], T16B, sourceReg[i], 1, table); |
| 108 | + } |
| 109 | + break; |
60 | 110 |
|
61 |
| - // c += d, b ^= c, b <<<= 7 |
62 |
| - addv(cVec, T4S, cVec, dVec); |
63 |
| - eor(scratch, T16B, bVec, cVec); |
64 |
| - ushr(bVec, T4S, scratch, 25); |
65 |
| - sli(bVec, T4S, scratch, 7); |
| 111 | + default: |
| 112 | + // The caller shouldn't be sending bit rotation values outside |
| 113 | + // of the 16/12/8/7 as defined in the specification. |
| 114 | + ShouldNotReachHere(); |
| 115 | + } |
66 | 116 | }
|
67 | 117 |
|
68 | 118 | /**
|
69 |
| - * Shift the b, c, and d vectors between columnar and diagonal representations. |
70 |
| - * Note that the "a" vector does not shift. |
| 119 | + * Set the FloatRegisters for a 4-vector register set. These will be used |
| 120 | + * during various quarter round transformations (adds, xors and left-rotations). |
| 121 | + * This method itself does not emit any assembly instructions. It only |
| 122 | + * selects which state vectors make up the set, so that the set can be in |
| 123 | + * columnar or diagonal alignment. |
71 | 124 | *
|
72 |
| - * @param bVec the SIMD register containing only the "b" values |
73 |
| - * @param cVec the SIMD register containing only the "c" values |
74 |
| - * @param dVec the SIMD register containing only the "d" values |
75 |
| - * @param colToDiag true if moving columnar to diagonal, false if |
76 |
| - * moving diagonal back to columnar. |
| 125 | + * @param vectorSet the 4-vector array to fill; all 4 entries are overwritten |
| 126 | + * @param stateVectors the 16-vector array that represents the current |
| 127 | + * working state. The indices of this array match up with the |
| 128 | + * organization of the ChaCha20 state per RFC 7539 (e.g. stateVectors[12] |
| 129 | + * would contain the vector that holds the 32-bit counter, etc.) |
| 130 | + * @param idx1 the index in stateVectors (0-15, not range-checked) of the |
| 131 | + * vector assigned to the first vectorSet element. |
| 132 | + * @param idx2 the index in stateVectors (0-15, not range-checked) of the |
| 133 | + * vector assigned to the second vectorSet element. |
| 134 | + * @param idx3 the index in stateVectors (0-15, not range-checked) of the |
| 135 | + * vector assigned to the third vectorSet element. |
| 136 | + * @param idx4 the index in stateVectors (0-15, not range-checked) of the |
| 137 | + * vector assigned to the fourth vectorSet element. |
77 | 138 | */
|
78 |
| -void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec, |
79 |
| - FloatRegister dVec, bool colToDiag) { |
80 |
| - int bShift = colToDiag ? 4 : 12; |
81 |
| - int cShift = 8; |
82 |
| - int dShift = colToDiag ? 12 : 4; |
83 |
| - |
84 |
| - ext(bVec, T16B, bVec, bVec, bShift); |
85 |
| - ext(cVec, T16B, cVec, cVec, cShift); |
86 |
| - ext(dVec, T16B, dVec, dVec, dShift); |
| 139 | +void MacroAssembler::cc20_set_qr_registers(FloatRegister (&vectorSet)[4], |
| 140 | + const FloatRegister (&stateVectors)[16], int idx1, int idx2, |
| 141 | + int idx3, int idx4) { |
| 142 | + vectorSet[0] = stateVectors[idx1]; |
| 143 | + vectorSet[1] = stateVectors[idx2]; |
| 144 | + vectorSet[2] = stateVectors[idx3]; |
| 145 | + vectorSet[3] = stateVectors[idx4]; |
87 | 146 | }
|
0 commit comments