8303762: Optimize vector slice operation with constant index using VPALIGNR instruction #24104

Draft · wants to merge 1 commit into master
5 changes: 5 additions & 0 deletions src/hotspot/cpu/aarch64/matcher_aarch64.hpp
@@ -153,6 +153,11 @@
return true;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return UseSVE > 0;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/arm/matcher_arm.hpp
@@ -146,6 +146,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/ppc/matcher_ppc.hpp
@@ -155,6 +155,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/riscv/matcher_riscv.hpp
@@ -152,6 +152,11 @@
return UseRVV;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return UseRVV;
5 changes: 5 additions & 0 deletions src/hotspot/cpu/s390/matcher_s390.hpp
@@ -149,6 +149,11 @@
return false;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static constexpr bool has_predicated_vectors(void) {
return false;
8 changes: 8 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
@@ -6778,6 +6778,14 @@ void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int
emit_int24(0x0F, (0xC0 | encode), imm8);
}

void Assembler::evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x3, (0xC0 | encode), imm8);
}

void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
@@ -2350,6 +2350,7 @@ class Assembler : public AbstractAssembler {
void palignr(XMMRegister dst, XMMRegister src, int imm8);
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
void evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len);

void pblendw(XMMRegister dst, XMMRegister src, int imm8);
void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
129 changes: 129 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -7091,3 +7091,132 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X
vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
}
}

void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
assert(vlen_enc == Assembler::AVX_256bit, "");
if (origin <= 16) {
// The ALIGNR instruction concatenates the corresponding 128-bit
// lanes of its two source vectors and then right-shifts the
// intermediate value. Thus the source vector lanes need to be
// shuffled into a format consumable by ALIGNR.
// i.e.
// Initial source vectors
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v3 v4]
// Formatted source vectors when SHIFT <= 16 bytes
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v2 v3]
// The upper 128-bit lane of src2 does not affect the result, which is
// sliced from the lower and upper 128-bit lanes of src1 and the lower
// 128-bit lane of src2.
// i.e.
// Result lanes
// res[127:0] = {src1[255:128] , src1[127:0]} >> SHIFT
// res[255:128] = {src2[127:0] , src1[255:128]} >> SHIFT
vextracti128_high(xtmp, src1);
vinserti128_high(xtmp, src2);
vpalignr(dst, xtmp, src1, origin, Assembler::AVX_256bit);
} else {
assert(origin > 16 && origin <= 32, "");
// Similarly, when SHIFT > 16 bytes, the lower 128-bit lane of
// src1 does not affect the result, which is sliced from the
// upper 128-bit lane of src1 and the lower and upper 128-bit
// lanes of src2.
// Thus, the two source vectors should have the following format
// 0...256 0...256
// src1 = [v2 v3] and src2 = [v3 v4]
// Result lanes
// res[127:0] = {src2[127:0] , src1[255:128]} >> SHIFT
// res[255:128] = {src2[255:128] , src2[127:0]} >> SHIFT
vextracti128_high(xtmp, src1);
vinserti128_high(xtmp, src2);
vpalignr(dst, src2, xtmp, origin, Assembler::AVX_256bit);
}
}
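A minimal scalar sketch (illustrative only, not part of this patch) of the byte movement above, assuming the documented per-128-bit-lane VPALIGNR semantics; slice_ref_32B, vpalignr_model and slice_32B_origin_le16 are invented helper names.

#include <cstdint>
#include <cstring>

// Reference: a 32-byte slice takes bytes [origin, origin + 32) of the
// 64-byte concatenation of src1 (low half) and src2 (high half).
static void slice_ref_32B(uint8_t* dst, const uint8_t* src1,
                          const uint8_t* src2, int origin) {
  uint8_t cat[64];
  std::memcpy(cat, src1, 32);
  std::memcpy(cat + 32, src2, 32);
  std::memcpy(dst, cat + origin, 32);
}

// Model of VPALIGNR ymm: per 128-bit lane, concatenate {hi : lo}, shift
// right by imm8 bytes, keep the low 16 bytes; bytes shifted in are zero.
static void vpalignr_model(uint8_t* dst, const uint8_t* hi,
                           const uint8_t* lo, int imm8) {
  for (int lane = 0; lane < 2; lane++) {
    uint8_t tmp[32];
    std::memcpy(tmp, lo + 16 * lane, 16);
    std::memcpy(tmp + 16, hi + 16 * lane, 16);
    for (int i = 0; i < 16; i++) {
      dst[16 * lane + i] = (imm8 + i < 32) ? tmp[imm8 + i] : 0;
    }
  }
}

// origin <= 16 path: xtmp holds {src1 upper lane, src2 lower lane} (the
// "[v2 v3]" operand), so each ALIGNR lane pair lines up with the
// reference byte window.
static void slice_32B_origin_le16(uint8_t* dst, const uint8_t* src1,
                                  const uint8_t* src2, int origin) {
  uint8_t xtmp[32];
  std::memcpy(xtmp, src1 + 16, 16);    // vextracti128_high(xtmp, src1)
  std::memcpy(xtmp + 16, src2, 16);    // vinserti128_high(xtmp, src2)
  vpalignr_model(dst, /*hi=*/xtmp, /*lo=*/src1, origin);
}

For origin in [0, 16], slice_32B_origin_le16 agrees with slice_ref_32B; this is the pair of res lane equations in the comment above written out in scalar form.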


void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (origin <= 16) {
// Initial source vectors
// 0.........512 0.........512
// src1 = [v1 v2 v3 v4] and src2 = [v5 v6 v7 v8]
// where each v* is a 128-bit wide vector lane.
// When SHIFT <= 16, the result is sliced out of src1 and the
// lowest 128-bit lane of src2.
// ALIGNR considers the following lane pairs, spread across the
// two source vectors, in order to compute the 128-bit lanes of
// the result vector.
// res[127:0] = {src1[255:128], src1[127:0]}
// res[255:128] = {src1[383:256], src1[255:128]}
// res[383:256] = {src1[511:384], src1[383:256]}
// res[511:384] = {src2[127:0], src1[511:384]}
//
// ALIGNR concatenates corresponding lanes of its source vectors
// before right-shifting the intermediate result. Therefore, the
// source vector lanes should be shuffled into the following format
// src1 = {v1, v2, v3, v4} and src2 = {v2, v3, v4, v5}
//
// |-------------|
// |-----|--------| |
// alignr -> [v1 v2 v3 v4] [v2 v3 v4 v5]
// |_____|________| |
// |_____________|
evalignd(xtmp, src2, src1, 4, vlen_enc);
vpalignr(dst, xtmp, src1, origin, vlen_enc);
} else if (origin > 16 && origin <= 32) {
// Similarly, for SHIFT between 16 and 32 bytes, the result
// is sliced out of src1 and the lower two 128-bit lanes of
// src2.
// i.e.
// res[127:0] = {src1[383:256], src1[255:128]}
// res[255:128] = {src1[511:384], src1[383:256]}
// res[383:256] = {src2[127:0], src1[511:384]}
// res[511:384] = {src2[255:128], src2[127:0]}
// Thus, the source vector lanes should have the following format.
// src1 = {v2, v3, v4, v5} and src2 = {v3, v4, v5, v6}
evalignd(xtmp, src2, src1, 4, vlen_enc);
evalignd(dst, src2, src1, 8, vlen_enc);
vpalignr(dst, dst, xtmp, origin, vlen_enc);
} else if (origin > 32 && origin <= 48) {
// For SHIFT between 32 and 48 bytes, the result is
// sliced out of src1 and the lower three 128-bit
// lanes of src2.
// i.e.
// res[127:0] = {src1[511:384], src1[383:256]}
// res[255:128] = {src2[127:0], src1[511:384]}
// res[383:256] = {src2[255:128], src2[127:0]}
// res[511:384] = {src2[383:256], src2[255:128]}
// Thus, the source vector lanes should have the following format.
// src1 = {v3, v4, v5, v6} and src2 = {v4, v5, v6, v7}
evalignd(xtmp, src2, src1, 8, vlen_enc);
evalignd(dst, src2, src1, 12, vlen_enc);
vpalignr(dst, dst, xtmp, origin, vlen_enc);
} else {
// Finally, for SHIFT greater than 48 bytes, the result is
// sliced out of the upper 128-bit lane of src1 and all of
// src2.
// i.e.
// res[127:0] = {src2[127:0], src1[511:384]}
// res[255:128] = {src2[255:128], src2[127:0]}
// res[383:256] = {src2[383:256], src2[255:128]}
// res[511:384] = {src2[511:384], src2[383:256]}
// Thus, the source vector lanes should have the following format.
// src1 = {v4, v5, v6, v7} and src2 = {v5, v6, v7, v8}
assert(origin > 48 && origin <= 64, "");
evalignd(xtmp, src2, src1, 12, vlen_enc);
vpalignr(dst, src2, xtmp, origin, vlen_enc);
}
}
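As a companion sketch (again illustrative, not part of the patch), EVEX VALIGND concatenates its two sources and shifts right at 32-bit granularity across the full vector width rather than per 128-bit lane; this is what produces the shifted lane arrangements used above. valignd_model is an invented name.

#include <cstdint>
#include <cstring>

// Model of EVEX VALIGND: concatenate {hi : lo} (the first source supplies
// the upper half), shift right by imm8 dwords across the whole vector, and
// keep the low vlen_bytes of the intermediate.
static void valignd_model(uint8_t* dst, const uint8_t* hi, const uint8_t* lo,
                          int imm8, int vlen_bytes) {
  uint8_t cat[128];                      // large enough for two zmm operands
  std::memcpy(cat, lo, vlen_bytes);
  std::memcpy(cat + vlen_bytes, hi, vlen_bytes);
  std::memcpy(dst, cat + 4 * imm8, vlen_bytes);
}

With 64-byte src1 = {v1, v2, v3, v4} and src2 = {v5, v6, v7, v8}, valignd_model(xtmp, src2, src1, 4, 64) yields {v2, v3, v4, v5}, imm8 = 8 yields {v3, v4, v5, v6}, and imm8 = 12 yields {v4, v5, v6, v7}: the shuffled operands that each origin range feeds to VPALIGNR.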

void C2_MacroAssembler::vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
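// The VALIGND-based sequence needs EVEX encodings, which 256-bit vectors
// only get with AVX512VL; otherwise fall back to the AVX2 extract/insert
// plus VPALIGNR sequence.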
if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
vector_slice_64B_op(dst, src1, src2, xtmp, origin, vlen_enc);
} else {
assert(vlen_enc == Assembler::AVX_256bit, "");
vector_slice_32B_op(dst, src1, src2, xtmp, origin, vlen_enc);
}
}
6 changes: 6 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -584,4 +584,10 @@

void select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);

void vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
5 changes: 5 additions & 0 deletions src/hotspot/cpu/x86/matcher_x86.hpp
@@ -179,6 +179,11 @@
return true;
}

// Does the CPU support vector slice with a non-constant index?
static constexpr bool supports_vector_slice_with_non_constant_index(int vlen, BasicType bt) {
return false;
}

// Some microarchitectures have mask registers used on vectors
static bool has_predicated_vectors(void) {
return VM_Version::supports_evex();
30 changes: 30 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
@@ -1856,6 +1856,11 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
return false; // Implementation limitation
}
break;
case Op_VectorSlice:
if (UseAVX < 1 || size_in_bits < 128) {
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
if(vlen == 2) {
@@ -10957,3 +10962,28 @@ instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
%}
ins_pipe( pipe_slow );
%}

instruct vector_slice_const_origin_LT16B_reg(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) == 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpalignr($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) >= 32);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}
2 changes: 1 addition & 1 deletion src/hotspot/share/adlc/formssel.cpp
@@ -4365,7 +4365,7 @@ bool MatchRule::is_vector() const {
"VectorRearrange", "VectorLoadShuffle", "VectorLoadConst",
"VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", "VectorSlice",
"VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD","FmaVF","PopCountVI","PopCountVL","PopulateIndex","VectorLongToMask",
"CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV",
12 changes: 12 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -1121,6 +1121,18 @@ class methodHandle;
"Z") \
do_name(vector_test_name, "test") \
\
do_intrinsic(_VectorSlice, jdk_internal_vm_vector_VectorSupport, vector_slice_name, vector_slice_sig, F_S) \
do_signature(vector_slice_sig, "(I" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"I" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSliceOp;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
do_name(vector_slice_name, "sliceOp") \
\
\
do_intrinsic(_VectorBlend, jdk_internal_vm_vector_VectorSupport, vector_blend_name, vector_blend_sig, F_S) \
do_signature(vector_blend_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
1 change: 1 addition & 0 deletions src/hotspot/share/opto/c2compiler.cpp
@@ -827,6 +827,7 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
case vmIntrinsics::_VectorSelectFromTwoVectorOp:
case vmIntrinsics::_VectorGatherOp:
case vmIntrinsics::_VectorScatterOp:
case vmIntrinsics::_VectorSlice:
case vmIntrinsics::_VectorReductionCoerced:
case vmIntrinsics::_VectorTest:
case vmIntrinsics::_VectorBlend:
19 changes: 19 additions & 0 deletions src/hotspot/share/opto/callGenerator.cpp
@@ -441,6 +441,23 @@ CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* cal
return cg;
}

class LateInlineVectorCallGenerator : public LateInlineCallGenerator {
protected:
CallGenerator* _inline_cg;

public:
LateInlineVectorCallGenerator(ciMethod* method, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) :
LateInlineCallGenerator(method, intrinsic_cg), _inline_cg(inline_cg) {}

CallGenerator* inline_cg2() const { return _inline_cg; }
virtual bool is_vector_late_inline() const { return true; }
};

CallGenerator* CallGenerator::for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) {
return new LateInlineVectorCallGenerator(m, intrinsic_cg, inline_cg);
}


// Allow inlining decisions to be delayed
class LateInlineVirtualCallGenerator : public VirtualCallGenerator {
private:
@@ -673,6 +690,8 @@ void CallGenerator::do_late_inline_helper() {

// Now perform the inlining using the synthesized JVMState
JVMState* new_jvms = inline_cg()->generate(jvms);
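// For vector intrinsics deferred to late inlining, fall back to inlining the
// Java implementation when the intrinsic call generator fails to expand.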
new_jvms = new_jvms == nullptr && is_vector_late_inline() ?
static_cast<const LateInlineVectorCallGenerator*>(this)->inline_cg2()->generate(jvms) : new_jvms;
if (new_jvms == nullptr) return; // no change
if (C->failing()) return;

2 changes: 2 additions & 0 deletions src/hotspot/share/opto/callGenerator.hpp
@@ -75,6 +75,7 @@ class CallGenerator : public ArenaObj {
// same but for method handle calls
virtual bool is_mh_late_inline() const { return false; }
virtual bool is_string_late_inline() const { return false; }
virtual bool is_vector_late_inline() const { return false; }
virtual bool is_boxing_late_inline() const { return false; }
virtual bool is_vector_reboxing_late_inline() const { return false; }
virtual bool is_virtual_late_inline() const { return false; }
@@ -141,6 +142,7 @@ class CallGenerator : public ArenaObj {
static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg);
static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_late_inline_virtual(ciMethod* m, int vtable_index, float expected_uses);
1 change: 1 addition & 0 deletions src/hotspot/share/opto/classes.hpp
@@ -513,6 +513,7 @@ macro(VectorRearrange)
macro(VectorLoadMask)
macro(VectorLoadShuffle)
macro(VectorLoadConst)
macro(VectorSlice)
macro(VectorStoreMask)
macro(VectorReinterpret)
macro(VectorCast)
4 changes: 3 additions & 1 deletion src/hotspot/share/opto/doCall.cpp
@@ -164,7 +164,9 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
cg_intrinsic = cg;
cg = nullptr;
} else if (IncrementalInline && should_delay_vector_inlining(callee, jvms)) {
return CallGenerator::for_late_inline(callee, cg);
float expected_uses = jvms->method()->scale_count(site_count, prof_factor);
CallGenerator* inline_cg = CallGenerator::for_inline(callee, expected_uses);
return CallGenerator::for_vector_late_inline(callee, cg, inline_cg);
} else {
return cg;
}
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/library_call.cpp
@@ -753,6 +753,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_index_vector();
case vmIntrinsics::_IndexPartiallyInUpperRange:
return inline_index_partially_in_upper_range();
case vmIntrinsics::_VectorSlice:
return inline_vector_slice();

case vmIntrinsics::_getObjectSize:
return inline_getObjectSize();