8303762: Optimize vector slice operation with constant index using VPALIGNR instruction #24104

Draft · wants to merge 1 commit into master
5 changes: 5 additions & 0 deletions src/hotspot/cpu/aarch64/matcher_aarch64.hpp
@@ -153,6 +153,11 @@
return true;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return UseSVE > 0;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/arm/matcher_arm.hpp
@@ -146,6 +146,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/ppc/matcher_ppc.hpp
@@ -155,6 +155,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/riscv/matcher_riscv.hpp
@@ -152,6 +152,11 @@
return UseRVV;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return UseRVV;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/s390/matcher_s390.hpp
@@ -149,6 +149,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
8 changes: 8 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
@@ -6778,6 +6778,14 @@ void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int
emit_int24(0x0F, (0xC0 | encode), imm8);
}

void Assembler::evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x3, (0xC0 | encode), imm8);
}

void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
@@ -2350,6 +2350,7 @@ class Assembler : public AbstractAssembler {
void palignr(XMMRegister dst, XMMRegister src, int imm8);
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
void evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len);

void pblendw(XMMRegister dst, XMMRegister src, int imm8);
void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
129 changes: 129 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -7091,3 +7091,132 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X
vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
}
}

void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
assert(vlen_enc == Assembler::AVX_256bit, "");
if (origin <= 16) {
// The ALIGNR instruction concatenates the corresponding 128-bit
// lanes of its two source vectors and then right-shifts the
// intermediate value. Thus the source vector lanes need to be
// shuffled into a format consumable by ALIGNR.
// i.e.
// Initial source vectors
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v3 v4]
// Formatted source vectors when SHIFT <= 16 bytes
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v2 v3]
// The upper 128-bit lane of src2 does not affect the result, which is
// sliced from the lower and upper 128-bit lanes of src1 and the lower
// 128-bit lane of src2.
// i.e.
// Result lanes
// res[127:0] = {src1[255:128] , src1[127:0]} >> SHIFT
// res[255:128] = {src2[127:0] , src1[255:128]} >> SHIFT
vextracti128_high(xtmp, src1);
vinserti128_high(xtmp, src2);
vpalignr(dst, xtmp, src1, origin, Assembler::AVX_256bit);
} else {
assert(origin > 16 && origin <= 32, "");
// Similarly, when SHIFT > 16 bytes, the lower 128-bit lane of
// src1 does not affect the result, which is sliced from the
// upper 128-bit lane of src1 and the lower and upper 128-bit
// lanes of src2.
// Thus, the two source vectors should have the following format
// 0...256 0...256
// src1 = [v2 v3] and src2 = [v3 v4]
// Result lanes
// res[127:0] = {src2[127:0] , src1[255:128]} >> SHIFT
// res[255:128] = {src2[255:128] , src2[127:0]} >> SHIFT
vextracti128_high(xtmp, src1);
vinserti128_high(xtmp, src2);
vpalignr(dst, src2, xtmp, origin, Assembler::AVX_256bit);
}
}
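A minimal scalar sketch (illustrative only, not part of this patch) of the byte movement above, assuming the documented per-128-bit-lane VPALIGNR semantics; slice_ref_32B, vpalignr_model and slice_32B_origin_le16 are invented helper names.

#include <cstdint>
#include <cstring>

// Reference: a 32-byte slice takes bytes [origin, origin + 32) of the
// 64-byte concatenation of src1 (low half) and src2 (high half).
static void slice_ref_32B(uint8_t* dst, const uint8_t* src1,
                          const uint8_t* src2, int origin) {
  uint8_t cat[64];
  std::memcpy(cat, src1, 32);
  std::memcpy(cat + 32, src2, 32);
  std::memcpy(dst, cat + origin, 32);
}

// Model of VPALIGNR ymm: per 128-bit lane, concatenate {hi : lo}, shift
// right by imm8 bytes, keep the low 16 bytes; bytes shifted in are zero.
static void vpalignr_model(uint8_t* dst, const uint8_t* hi,
                           const uint8_t* lo, int imm8) {
  for (int lane = 0; lane < 2; lane++) {
    uint8_t tmp[32];
    std::memcpy(tmp, lo + 16 * lane, 16);
    std::memcpy(tmp + 16, hi + 16 * lane, 16);
    for (int i = 0; i < 16; i++) {
      dst[16 * lane + i] = (imm8 + i < 32) ? tmp[imm8 + i] : 0;
    }
  }
}

// origin <= 16 path: xtmp holds {src1 upper lane, src2 lower lane} (the
// "[v2 v3]" operand), so each ALIGNR lane pair lines up with the
// reference byte window.
static void slice_32B_origin_le16(uint8_t* dst, const uint8_t* src1,
                                  const uint8_t* src2, int origin) {
  uint8_t xtmp[32];
  std::memcpy(xtmp, src1 + 16, 16);    // vextracti128_high(xtmp, src1)
  std::memcpy(xtmp + 16, src2, 16);    // vinserti128_high(xtmp, src2)
  vpalignr_model(dst, /*hi=*/xtmp, /*lo=*/src1, origin);
}

For origin in [0, 16], slice_32B_origin_le16 agrees with slice_ref_32B; this is the pair of res lane equations in the comment above written out in scalar form.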


void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (origin <= 16) {
// Initial source vectors
// 0.........512 0.........512
// src1 = [v1 v2 v3 v4] and src2 = [v5 v6 v7 v8]
// where each v* is a 128-bit wide vector lane.
// When SHIFT <= 16, the result is sliced out of src1 and the
// lowest 128-bit lane of src2.
// ALIGNR considers the following lane pairs, spread across the
// two source vectors, in order to compute the 128-bit lanes of
// the result vector.
// res[127:0] = {src1[255:128], src1[127:0]}
// res[255:128] = {src1[383:256], src1[255:128]}
// res[383:256] = {src1[511:384], src1[383:256]}
// res[511:384] = {src2[127:0], src1[511:384]}
//
// ALIGNR concatenates corresponding lanes of its source vectors
// before right-shifting the intermediate result. Therefore, the
// source vector lanes should be shuffled into the following format
// src1 = {v1, v2, v3, v4} and src2 = {v2, v3, v4, v5}
//
// |-------------|
// |-----|--------| |
// alignr -> [v1 v2 v3 v4] [v2 v3 v4 v5]
// |_____|________| |
// |_____________|
evalignd(xtmp, src2, src1, 4, vlen_enc);
vpalignr(dst, xtmp, src1, origin, vlen_enc);
} else if (origin > 16 && origin <= 32) {
// Similarly, for SHIFT between 16 and 32 bytes, the result
// is sliced out of src1 and the lower two 128-bit lanes of
// src2.
// i.e.
// res[127:0] = {src1[383:256], src1[255:128]}
// res[255:128] = {src1[511:384], src1[383:256]}
// res[383:256] = {src2[127:0], src1[511:384]}
// res[511:384] = {src2[255:128], src2[127:0]}
// Thus, the source vector lanes should have the following format.
// src1 = {v2, v3, v4, v5} and src2 = {v3, v4, v5, v6}
evalignd(xtmp, src2, src1, 4, vlen_enc);
evalignd(dst, src2, src1, 8, vlen_enc);
vpalignr(dst, dst, xtmp, origin, vlen_enc);
} else if (origin > 32 && origin <= 48) {
// For SHIFT between 32 and 48 bytes, the result is
// sliced out of src1 and the lower three 128-bit
// lanes of src2.
// i.e.
// res[127:0] = {src1[511:384], src1[383:256]}
// res[255:128] = {src2[127:0], src1[511:384]}
// res[383:256] = {src2[255:128], src2[127:0]}
// res[511:384] = {src2[383:256], src2[255:128]}
// Thus, the source vector lanes should have the following format.
// src1 = {v3, v4, v5, v6} and src2 = {v4, v5, v6, v7}
evalignd(xtmp, src2, src1, 8, vlen_enc);
evalignd(dst, src2, src1, 12, vlen_enc);
vpalignr(dst, dst, xtmp, origin, vlen_enc);
} else {
// Finally, for SHIFT greater than 48 bytes, the result is
// sliced out of the upper 128-bit lane of src1 and all of
// src2.
// i.e.
// res[127:0] = {src2[127:0], src1[511:384]}
// res[255:128] = {src2[255:128], src2[127:0]}
// res[383:256] = {src2[383:256], src2[255:128]}
// res[511:384] = {src2[511:384], src2[383:256]}
// Thus, the source vector lanes should have the following format.
// src1 = {v4, v5, v6, v7} and src2 = {v5, v6, v7, v8}
assert(origin > 48 && origin <= 64, "");
evalignd(xtmp, src2, src1, 12, vlen_enc);
vpalignr(dst, src2, xtmp, origin, vlen_enc);
}
}
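As a companion sketch (again illustrative, not part of the patch), EVEX VALIGND concatenates its two sources and shifts right at 32-bit granularity across the full vector width rather than per 128-bit lane; this is what produces the shifted lane arrangements used above. valignd_model is an invented name.

#include <cstdint>
#include <cstring>

// Model of EVEX VALIGND: concatenate {hi : lo} (the first source supplies
// the upper half), shift right by imm8 dwords across the whole vector, and
// keep the low vlen_bytes of the intermediate.
static void valignd_model(uint8_t* dst, const uint8_t* hi, const uint8_t* lo,
                          int imm8, int vlen_bytes) {
  uint8_t cat[128];                      // large enough for two zmm operands
  std::memcpy(cat, lo, vlen_bytes);
  std::memcpy(cat + vlen_bytes, hi, vlen_bytes);
  std::memcpy(dst, cat + 4 * imm8, vlen_bytes);
}

With 64-byte src1 = {v1, v2, v3, v4} and src2 = {v5, v6, v7, v8}, valignd_model(xtmp, src2, src1, 4, 64) yields {v2, v3, v4, v5}, imm8 = 8 yields {v3, v4, v5, v6}, and imm8 = 12 yields {v4, v5, v6, v7}: the shuffled operands that each origin range feeds to VPALIGNR.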

void C2_MacroAssembler::vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
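// The VALIGND-based sequence needs EVEX encodings, which 256-bit vectors
// only get with AVX512VL; otherwise fall back to the AVX2 extract/insert
// plus VPALIGNR sequence.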
if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
vector_slice_64B_op(dst, src1, src2, xtmp, origin, vlen_enc);
} else {
assert(vlen_enc == Assembler::AVX_256bit, "");
vector_slice_32B_op(dst, src1, src2, xtmp, origin, vlen_enc);
}
}
6 changes: 6 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -584,4 +584,10 @@

void select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);

void vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
5 changes: 5 additions & 0 deletions src/hotspot/cpu/x86/matcher_x86.hpp
@@ -179,6 +179,11 @@
return true;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return VM_Version::supports_evex();
30 changes: 30 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
@@ -1856,6 +1856,11 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
return false; // Implementation limitation
}
break;
case Op_VectorSlice:
if (UseAVX < 1 || size_in_bits < 128) {
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
if(vlen == 2) {
@@ -10957,3 +10962,28 @@ instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
%}
ins_pipe( pipe_slow );
%}

instruct vector_slice_const_origin_LT16B_reg(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) == 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpalignr($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) >= 32);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}
2 changes: 1 addition & 1 deletion src/hotspot/share/adlc/formssel.cpp
@@ -4365,7 +4365,7 @@ bool MatchRule::is_vector() const {
"VectorRearrange", "VectorLoadShuffle", "VectorLoadConst",
"VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", "VectorSlice",
"VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD","FmaVF","PopCountVI","PopCountVL","PopulateIndex","VectorLongToMask",
"CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV",
12 changes: 12 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -1121,6 +1121,18 @@ class methodHandle;
"Z") \
do_name(vector_test_name, "test") \
\
do_intrinsic(_VectorSlice, jdk_internal_vm_vector_VectorSupport, vector_slice_name, vector_slice_sig, F_S) \
do_signature(vector_slice_sig, "(I" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"I" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSliceOp;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
do_name(vector_slice_name, "sliceOp") \
\
\
do_intrinsic(_VectorBlend, jdk_internal_vm_vector_VectorSupport, vector_blend_name, vector_blend_sig, F_S) \
do_signature(vector_blend_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
1 change: 1 addition & 0 deletions src/hotspot/share/opto/c2compiler.cpp
@@ -827,6 +827,7 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
case vmIntrinsics::_VectorSelectFromTwoVectorOp:
case vmIntrinsics::_VectorGatherOp:
case vmIntrinsics::_VectorScatterOp:
case vmIntrinsics::_VectorSlice:
case vmIntrinsics::_VectorReductionCoerced:
case vmIntrinsics::_VectorTest:
case vmIntrinsics::_VectorBlend:
19 changes: 19 additions & 0 deletions src/hotspot/share/opto/callGenerator.cpp
@@ -441,6 +441,23 @@ CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* cal
return cg;
}

class LateInlineVectorCallGenerator : public LateInlineCallGenerator {
protected:
CallGenerator* _inline_cg;

public:
LateInlineVectorCallGenerator(ciMethod* method, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) :
LateInlineCallGenerator(method, intrinsic_cg), _inline_cg(inline_cg) {}

CallGenerator* inline_cg2() const { return _inline_cg; }
virtual bool is_vector_late_inline() const { return true; }
};

CallGenerator* CallGenerator::for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) {
return new LateInlineVectorCallGenerator(m, intrinsic_cg, inline_cg);
}


// Allow inlining decisions to be delayed
class LateInlineVirtualCallGenerator : public VirtualCallGenerator {
private:
@@ -673,6 +690,8 @@ void CallGenerator::do_late_inline_helper() {

// Now perform the inlining using the synthesized JVMState
JVMState* new_jvms = inline_cg()->generate(jvms);
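// For vector intrinsics deferred to late inlining, fall back to inlining the
// Java implementation when the intrinsic call generator fails to expand.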
new_jvms = new_jvms == nullptr && is_vector_late_inline() ?
static_cast<const LateInlineVectorCallGenerator*>(this)->inline_cg2()->generate(jvms) : new_jvms;
if (new_jvms == nullptr) return; // no change
if (C->failing()) return;

2 changes: 2 additions & 0 deletions src/hotspot/share/opto/callGenerator.hpp
@@ -75,6 +75,7 @@ class CallGenerator : public ArenaObj {
// same but for method handle calls
virtual bool is_mh_late_inline() const { return false; }
virtual bool is_string_late_inline() const { return false; }
virtual bool is_vector_late_inline() const { return false; }
virtual bool is_boxing_late_inline() const { return false; }
virtual bool is_vector_reboxing_late_inline() const { return false; }
virtual bool is_virtual_late_inline() const { return false; }
@@ -141,6 +142,7 @@ class CallGenerator : public ArenaObj {
static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg);
static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_late_inline_virtual(ciMethod* m, int vtable_index, float expected_uses);
1 change: 1 addition & 0 deletions src/hotspot/share/opto/classes.hpp
@@ -513,6 +513,7 @@ macro(VectorRearrange)
macro(VectorLoadMask)
macro(VectorLoadShuffle)
macro(VectorLoadConst)
macro(VectorSlice)
macro(VectorStoreMask)
macro(VectorReinterpret)
macro(VectorCast)
4 changes: 3 additions & 1 deletion src/hotspot/share/opto/doCall.cpp
@@ -164,7 +164,9 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
cg_intrinsic = cg;
cg = nullptr;
} else if (IncrementalInline && should_delay_vector_inlining(callee, jvms)) {
return CallGenerator::for_late_inline(callee, cg);
float expected_uses = jvms->method()->scale_count(site_count, prof_factor);
CallGenerator* inline_cg = CallGenerator::for_inline(callee, expected_uses);
return CallGenerator::for_vector_late_inline(callee, cg, inline_cg);
} else {
return cg;
}
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/library_call.cpp
@@ -753,6 +753,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_index_vector();
case vmIntrinsics::_IndexPartiallyInUpperRange:
return inline_index_partially_in_upper_range();
case vmIntrinsics::_VectorSlice:
return inline_vector_slice();

case vmIntrinsics::_getObjectSize:
return inline_getObjectSize();