@@ -135,9 +135,9 @@ source %{
135
135
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
136
136
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
137
137
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
138
- // The vector implementation of Op_AddReductionVD/F is for the Vector API only.
139
- // It is not suitable for auto-vectorization because it does not add the elements
140
- // in the same order as sequential code, and FP addition is non-associative.
138
+ // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
139
+ // They are not suitable for auto-vectorization because the result would not conform
140
+ // to the JLS, Section 15.7 (Evaluation Order).
141
141
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
142
142
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
143
143
opcode == Op_MulVL) {
@@ -2858,26 +2858,28 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
2858
2858
%}
2859
2859
2860
2860
// reduction addF
2861
- // Floating-point addition is not associative, so the rules for AddReductionVF
2862
- // on NEON can't be used to auto-vectorize floating-point reduce-add.
2863
- // Currently, on NEON, AddReductionVF is only generated by Vector API.
2864
- instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2865
- predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
2861
+
2862
+ instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2863
+ // Non-strictly ordered floating-point add reduction for a 64-bit vector. This rule is
2864
+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
2865
+ predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order() );
2866
2866
match(Set dst (AddReductionVF fsrc vsrc));
2867
2867
effect(TEMP_DEF dst);
2868
- format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
2868
+ format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
2869
2869
ins_encode %{
2870
2870
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
2871
2871
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
2872
2872
%}
2873
2873
ins_pipe(pipe_slow);
2874
2874
%}
2875
2875
2876
- instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2877
- predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
2876
+ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2877
+ // Non-strictly ordered floating-point add reduction for a 128-bit vector. This rule is
2878
+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
2879
+ predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
2878
2880
match(Set dst (AddReductionVF fsrc vsrc));
2879
2881
effect(TEMP_DEF dst, TEMP tmp);
2880
- format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2882
+ format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2881
2883
ins_encode %{
2882
2884
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
2883
2885
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
@@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2886
2888
ins_pipe(pipe_slow);
2887
2889
%}
2888
2890
2891
+ // This rule calculates the reduction result in strict order. Two cases will
2892
+ // reach here:
2893
+ // 1. Non-strictly ordered AddReductionVF when the vector size > 128 bits. For example -
2894
+ // AddReductionVF generated by the Vector API. For vector sizes > 128 bits, it is more
2895
+ // beneficial performance-wise to generate a direct SVE instruction even if it is
2896
+ // strictly ordered.
2897
+ // 2. Strictly-ordered AddReductionVF. For example - AddReductionVF generated by
2898
+ // auto-vectorization on an SVE machine.
2889
2899
instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
2890
- predicate(UseSVE > 0);
2900
+ predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2901
+ n->as_Reduction()->requires_strict_order());
2891
2902
match(Set dst_src1 (AddReductionVF dst_src1 src2));
2892
2903
format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
2893
2904
ins_encode %{
2905
+ assert(UseSVE > 0, "must be sve");
2894
2906
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
2895
2907
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
2896
2908
__ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister);
@@ -2899,26 +2911,36 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
2899
2911
%}
2900
2912
2901
2913
// reduction addD
2902
- // Floating-point addition is not associative, so the rule for AddReductionVD
2903
- // on NEON can't be used to auto-vectorize floating-point reduce-add.
2904
- // Currently, on NEON, AddReductionVD is only generated by Vector API.
2905
- instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
2906
- predicate(UseSVE == 0 );
2914
+
2915
+ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
2916
+ // Non-strictly ordered floating-point add reduction for doubles. This rule is
2917
+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
2918
+ predicate(!n->as_Reduction()->requires_strict_order() );
2907
2919
match(Set dst (AddReductionVD dsrc vsrc));
2908
2920
effect(TEMP_DEF dst);
2909
- format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
2921
+ format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
2910
2922
ins_encode %{
2911
2923
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
2912
2924
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
2913
2925
%}
2914
2926
ins_pipe(pipe_slow);
2915
2927
%}
2916
2928
2929
+ // This rule calculates the reduction result in strict order. Two cases will
2930
+ // reach here:
2931
+ // 1. Non-strictly ordered AddReductionVD when the vector size > 128 bits. For example -
2932
+ // AddReductionVD generated by the Vector API. For vector sizes > 128 bits, it is more
2933
+ // beneficial performance-wise to generate a direct SVE instruction even if it is
2934
+ // strictly ordered.
2935
+ // 2. Strictly-ordered AddReductionVD. For example - AddReductionVD generated by
2936
+ // auto-vectorization on an SVE machine.
2917
2937
instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
2918
- predicate(UseSVE > 0);
2938
+ predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2939
+ n->as_Reduction()->requires_strict_order());
2919
2940
match(Set dst_src1 (AddReductionVD dst_src1 src2));
2920
2941
format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
2921
2942
ins_encode %{
2943
+ assert(UseSVE > 0, "must be sve");
2922
2944
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
2923
2945
assert(length_in_bytes == MaxVectorSize, "invalid vector length");
2924
2946
__ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
0 commit comments