@@ -5103,6 +5103,262 @@ class StubGenerator: public StubCodeGenerator {
     return (address) start;
   }

+  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
+                             VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
+                             Register temp0, Register temp1, Register temp2, Register temp3,
+                             VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
+
+    assert((lmul == Assembler::m4 && step == 64) ||
+           (lmul == Assembler::m2 && step == 32) ||
+           (lmul == Assembler::m1 && step == 16),
+           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
+    // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
+    // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
+    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
+    // In non-vectorized code, we update s1 and s2 as:
+    //   s1 <- s1 + b1
+    //   s2 <- s2 + s1
+    //   s1 <- s1 + b2
+    //   s2 <- s2 + s1
+    //   ...
+    //   s1 <- s1 + b64
+    //   s2 <- s2 + s1
+    // Putting the above assignments together, we have:
+    //   s1_new = s1 + b1 + b2 + ... + b64
+    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
+    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
+    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
+
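+    // For reference, a rough scalar sketch of the same per-block update (the helper
+    // name and shape are illustrative only; the vector code below is the actual
+    // implementation):
+    //
+    //   static void adler32_block_ref(const uint8_t* b, int step, uint32_t& s1, uint32_t& s2) {
+    //     uint32_t sum = 0, dot = 0;
+    //     for (int i = 0; i < step; i++) {
+    //       sum += b[i];                 // b1 + b2 + ... + b_step
+    //       dot += (step - i) * b[i];    // (b1, ..., b_step) dot (step, ..., 1)
+    //     }
+    //     s2 += s1 * step + dot;         // s2_new
+    //     s1 += sum;                     // s1_new
+    //   }
+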
+    __ mv(temp3, step);
+    // Load data
+    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
+    __ vle8_v(vbytes, buff);
+    __ addi(buff, buff, step);
+
+    // Upper bound of the reduction sum for s1_new:
+    //   0xFF * 64 = 0x3FC0, so:
+    //   1. Need to do a vector-widening reduction sum
+    //   2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
+    __ vwredsumu_vs(vs1acc, vbytes, vzero);
+    // Multiplication for s2_new
+    __ vwmulu_vv(vs2acc, vtable, vbytes);
+
+    // s2 = s2 + s1 * step (shift left by log2(step))
+    __ slli(temp1, s1, exact_log2(step));
+    __ add(s2, s2, temp1);
+
+    // Summing up calculated results for s2_new
+    if (MaxVectorSize > 16) {
+      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
+    } else {
+      // For vlen == 16, half of the vector-widening multiplication result lands in
+      // the successor of the vs2acc register group, so we need to double the vector
+      // register group width in order to reduction-sum all of it
+      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
+                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
+      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
+    }
+    // Upper bound of the reduction sum for s2_new:
+    //   0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
+    //   1. Need to do a vector-widening reduction sum
+    //   2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
+    __ vwredsumu_vs(vtemp1, vs2acc, vzero);
+
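+    // Worked check of the two bounds above (both values stay well below the sign
+    // bit of their element width, so the sign-extending vmv.x.s reads are harmless):
+    //   s1 accumulator: 0xFF * 64                          = 16320  = 0x3FC0  < 2^15
+    //   s2 accumulator: 0xFF * (64 * 65 / 2) = 0xFF * 2080 = 530400 = 0x817E0 < 2^31
+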
+    // Extracting results for:
+    // s1_new
+    __ vmv_x_s(temp0, vs1acc);
+    __ add(s1, s1, temp0);
+    // s2_new
+    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
+    __ vmv_x_s(temp1, vtemp1);
+    __ add(s2, s2, temp1);
+  }
+
+  /***
+   *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
+   *
+   *  Arguments:
+   *
+   *  Inputs:
+   *   c_rarg0   - int   adler
+   *   c_rarg1   - byte* buff (b + off)
+   *   c_rarg2   - int   len
+   *
+   *  Output:
+   *   c_rarg0   - int adler result
+   */
+  address generate_updateBytesAdler32() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
+    address start = __ pc();
+
+    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
+          L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
+
+    // Aliases
+    Register adler = c_rarg0;
+    Register s1    = c_rarg0;
+    Register s2    = c_rarg3;
+    Register buff  = c_rarg1;
+    Register len   = c_rarg2;
+    Register nmax  = c_rarg4;
+    Register base  = c_rarg5;
+    Register count = c_rarg6;
+    Register temp0 = x28; // t3
+    Register temp1 = x29; // t4
+    Register temp2 = x30; // t5
+    Register temp3 = x31; // t6
+
+    VectorRegister vzero     = v31;
+    VectorRegister vbytes    = v8;  // group: v8, v9, v10, v11
+    VectorRegister vs1acc    = v12; // group: v12, v13, v14, v15
+    VectorRegister vs2acc    = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
+    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
+    VectorRegister vtable_32 = v4;  // group: v4, v5
+    VectorRegister vtable_16 = v30;
+    VectorRegister vtemp1    = v28;
+    VectorRegister vtemp2    = v29;
+
+    // Max number of bytes we can process before having to take the mod
+    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
+    const uint64_t BASE = 0xfff1;
+    const uint64_t NMAX = 0x15B0;
+
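+    // Worked check of the NMAX bound with BASE = 65521 (so BASE - 1 = 65520):
+    //   n = 5552: 255 * 5552 * 5553 / 2 + 5553 * 65520 = 3930857640 + 363832560 = 4294690200 <= 2^32-1
+    //   n = 5553: 255 * 5553 * 5554 / 2 + 5554 * 65520 = 3932273655 + 363898080 = 4296171735 >  2^32-1
+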
+    // Loop steps
+    int step_64 = 64;
+    int step_32 = 32;
+    int step_16 = 16;
+    int step_1  = 1;
+
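+    // For orientation, a scalar C sketch of the overall algorithm generated below
+    // (illustration only; the stub vectorizes the inner loop in 64/32/16-byte blocks
+    // and unrolls the tail handling):
+    //
+    //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
+    //   while (len > 0) {
+    //     int n = len < NMAX ? len : NMAX;
+    //     len -= n;
+    //     while (n-- > 0) { s1 += *buff++; s2 += s1; }
+    //     s1 %= BASE; s2 %= BASE;
+    //   }
+    //   return (s2 << 16) | s1;
+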
+    __ enter(); // Required for proper stackwalking of RuntimeStub frame
+    __ mv(temp1, 64);
+    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
+
+    // Generating accumulation coefficients for further calculations
+    // vtable_64:
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_64, vtemp1, temp1);
+    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
+
+    // vtable_32:
+    __ mv(temp1, 32);
+    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_32, vtemp1, temp1);
+    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
+
+    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
+    // vtable_16:
+    __ mv(temp1, 16);
+    __ vid_v(vtemp1);
+    __ vrsub_vx(vtable_16, vtemp1, temp1);
+    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
+
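+    // vid_v writes the element indices {0, 1, 2, ...} into the lanes, and vrsub_vx
+    // computes (scalar - element). A scalar sketch of the resulting table contents
+    // (illustration only):
+    //
+    //   for (int i = 0; i < n; i++) {
+    //     vtable[i] = n - i;   // n = 64, 32 or 16  =>  { n, n-1, ..., 2, 1 }
+    //   }
+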
+    __ vmv_v_i(vzero, 0);
+
+    __ mv(base, BASE);
+    __ mv(nmax, NMAX);
+
+    // s1 is initialized to the lower 16 bits of adler
+    // s2 is initialized to the upper 16 bits of adler
+    __ srliw(s2, adler, 16);       // s2 = ((adler >> 16) & 0xffff)
+    __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
+
+    // The pipelined loop needs at least 16 elements for 1 iteration
+    // It does check this, but it is more effective to skip to the cleanup loop
+    __ mv(temp0, step_16);
+    __ bgeu(len, temp0, L_nmax);
+    __ beqz(len, L_combine);
+
+    // Jumping to L_by1_loop
+    __ sub(len, len, step_1);
+    __ j(L_by1_loop);
+
+    __ bind(L_nmax);
+    __ sub(len, len, nmax);
+    __ sub(count, nmax, 16);
+    __ bltz(len, L_by16);
+
+    // Align L_nmax loop by 64
+    __ bind(L_nmax_loop_entry);
+    __ sub(count, count, 32);
+
+    __ bind(L_nmax_loop);
+    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_64, Assembler::m4);
+    __ sub(count, count, step_64);
+    __ bgtz(count, L_nmax_loop);
+
+    // There are three 16-byte iterations (48 bytes) left to do
+    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_32, Assembler::m2);
+    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_16, Assembler::m1);
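+
+    // Byte accounting for one NMAX chunk (a worked check of the loop bounds above):
+    //   count enters the 64-byte loop as NMAX - 16 - 32 = 5552 - 48 = 5504 = 86 * 64,
+    //   so that loop runs 86 times, and the 32- and 16-byte steps just done cover
+    //   the remaining 48 bytes of the chunk.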
+
+    // s1 = s1 % BASE
+    __ remuw(s1, s1, base);
+    // s2 = s2 % BASE
+    __ remuw(s2, s2, base);
+
+    __ sub(len, len, nmax);
+    __ sub(count, nmax, 16);
+    __ bgez(len, L_nmax_loop_entry);
+
+    __ bind(L_by16);
+    __ add(len, len, count);
+    __ bltz(len, L_by1);
+    // Trying to unroll
+    __ mv(temp3, step_64);
+    __ blt(len, temp3, L_by16_loop);
+
+    __ bind(L_by16_loop_unroll);
+    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_64, Assembler::m4);
+    __ sub(len, len, step_64);
+    // By now temp3 should still hold 64
+    __ bge(len, temp3, L_by16_loop_unroll);
+
+    __ bind(L_by16_loop);
+    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
+      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
+      vtemp1, vtemp2, step_16, Assembler::m1);
+    __ sub(len, len, step_16);
+    __ bgez(len, L_by16_loop);
+
+    __ bind(L_by1);
+    __ add(len, len, 15);
+    __ bltz(len, L_do_mod);
+
+    __ bind(L_by1_loop);
+    __ lbu(temp0, Address(buff, 0));
+    __ addi(buff, buff, step_1);
+    __ add(s1, temp0, s1);
+    __ add(s2, s2, s1);
+    __ sub(len, len, step_1);
+    __ bgez(len, L_by1_loop);
+
+    __ bind(L_do_mod);
+    // s1 = s1 % BASE
+    __ remuw(s1, s1, base);
+    // s2 = s2 % BASE
+    __ remuw(s2, s2, base);
+
+    // Combine lower bits and higher bits
+    // adler = s1 | (s2 << 16)
+    __ bind(L_combine);
+    __ slli(s2, s2, 16);
+    __ orr(s1, s1, s2);
+
+    __ leave(); // Required for proper stackwalking of RuntimeStub frame
+    __ ret();
+
+    return start;
+  }
+

 #endif // COMPILER2_OR_JVMCI

 #ifdef COMPILER2
@@ -5746,6 +6002,10 @@ static const int64_t right_3_bits = right_n_bits(3);
       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
     }

+    if (UseAdler32Intrinsics) {
+      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
+    }
+
 #endif // COMPILER2_OR_JVMCI
   }