[llvm] 9cd7c53 - [InstCombine] Enable fold select into operand for FAdd, FMul, FSub and FDiv.

Mon Nov 22 15:10:30 PST 2021

Author: Huihui Zhang
Date: 2021-11-22T15:10:10-08:00
New Revision: 9cd7c534e27c2558ef16e14d4440bd838320334b

URL: https://github.com/llvm/llvm-project/commit/9cd7c534e27c2558ef16e14d4440bd838320334b
DIFF: https://github.com/llvm/llvm-project/commit/9cd7c534e27c2558ef16e14d4440bd838320334b.diff

LOG: [InstCombine] Enable fold select into operand for FAdd,  FMul, FSub and FDiv.

For FAdd, FMul, FSub and FDiv, fold select into one of the operands to enable
further optimizations, i.e., floating-point reduction detection.

Turn code:
  %C = fadd %A, %B
  %D = select %cond, %C, %A

into:
  %C = select %cond, %B, -0.000000e+00
  %D = fadd %A, %C

Alive2 verification (with --disable-undef-input), timed out otherwise.
FAdd - https://alive2.llvm.org/ce/z/eUxN4Y
FMul - https://alive2.llvm.org/ce/z/5SWZz4
FSub - https://alive2.llvm.org/ce/z/Dhj8dU
FDiv - https://alive2.llvm.org/ce/z/Yj_NA2

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D113442

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
    llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
    llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
    llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
    llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 4a1e82ae9c1d0..518d3952dce50 100644

--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -246,12 +246,16 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
 static unsigned getSelectFoldableOperands(BinaryOperator *I) {
   switch (I->getOpcode()) {
   case Instruction::Add:
+  case Instruction::FAdd:
   case Instruction::Mul:
+  case Instruction::FMul:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
     return 3;              // Can fold through either operand.
   case Instruction::Sub:   // Can only fold on the amount subtracted.
+  case Instruction::FSub:
+  case Instruction::FDiv:  // Can only fold on the divisor amount.
   case Instruction::Shl:   // Can only fold on the shift amount.
   case Instruction::LShr:
   case Instruction::AShr:

diff  --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
index db989def65dbc..0dfd116d56219 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
@@ -3,8 +3,8 @@
 
 define float @select_fadd(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fadd(
-; CHECK-NEXT:    [[C:%.*]] = fadd float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd float %A, %B
@@ -14,8 +14,8 @@ define float @select_fadd(i1 %cond, float %A, float %B) {
 
 define float @select_fadd_swapped(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fadd_swapped(
-; CHECK-NEXT:    [[C:%.*]] = fadd float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd float %A, %B
@@ -25,8 +25,8 @@ define float @select_fadd_swapped(i1 %cond, float %A, float %B) {
 
 define float @select_fadd_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fadd_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fadd fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
@@ -36,8 +36,8 @@ define float @select_fadd_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fadd_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fadd fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
@@ -47,8 +47,8 @@ define float @select_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fmul(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fmul(
-; CHECK-NEXT:    [[C:%.*]] = fmul float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
@@ -58,8 +58,8 @@ define float @select_fmul(i1 %cond, float %A, float %B) {
 
 define float @select_fmul_swapped(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fmul_swapped(
-; CHECK-NEXT:    [[C:%.*]] = fmul float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
@@ -69,8 +69,8 @@ define float @select_fmul_swapped(i1 %cond, float %A, float %B) {
 
 define float @select_fmul_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fmul_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fmul fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
@@ -80,8 +80,8 @@ define float @select_fmul_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fmul_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fmul fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
@@ -91,8 +91,8 @@ define float @select_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fsub(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fsub(
-; CHECK-NEXT:    [[C:%.*]] = fsub float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
@@ -102,8 +102,8 @@ define float @select_fsub(i1 %cond, float %A, float %B) {
 
 define float @select_fsub_swapped(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fsub_swapped(
-; CHECK-NEXT:    [[C:%.*]] = fsub float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
@@ -113,8 +113,8 @@ define float @select_fsub_swapped(i1 %cond, float %A, float %B) {
 
 define float @select_fsub_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fsub_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fsub fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
@@ -124,8 +124,8 @@ define float @select_fsub_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fsub_swapped_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fsub_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fsub fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
@@ -147,8 +147,8 @@ define float @select_fsub_invalid(i1 %cond, float %A, float %B) {
 
 define float @select_fdiv(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fdiv(
-; CHECK-NEXT:    [[C:%.*]] = fdiv float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
@@ -158,8 +158,8 @@ define float @select_fdiv(i1 %cond, float %A, float %B) {
 
 define float @select_fdiv_swapped(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fdiv_swapped(
-; CHECK-NEXT:    [[C:%.*]] = fdiv float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
@@ -169,8 +169,8 @@ define float @select_fdiv_swapped(i1 %cond, float %A, float %B) {
 
 define float @select_fdiv_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fdiv_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fdiv fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B
@@ -180,8 +180,8 @@ define float @select_fdiv_fast_math(i1 %cond, float %A, float %B) {
 
 define float @select_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) {
 ; CHECK-LABEL: @select_fdiv_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = fdiv fast float [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[A]], float [[C]]
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index ff8aae86e8212..7a06d3afde421 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -43,8 +43,8 @@ define float @test(float* nocapture readonly %pA, float* nocapture readonly %pB,
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = fdiv fast <4 x float> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <4 x float> [[TMP10]], [[VEC_PHI]]
-; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP4]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[TMP10]]
+; CHECK-NEXT:    [[PREDPHI]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP11]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index a99f6610e9186..42de1a781d90e 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -959,7 +959,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1019,7 +1019,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP40:%.*]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP38]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP40]])
 ; CHECK-NEXT:    [[TMP42:%.*]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP39]], <4 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP41]], <4 x float> [[TMP42]])
+; CHECK-NEXT:    [[TMP43]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP41]], <4 x float> [[TMP42]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
@@ -1366,9 +1366,9 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
+; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
+; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]

diff  --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 76ef0faf8fd81..01b26818fa8c0 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -58,21 +58,21 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
@@ -166,21 +166,21 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
@@ -226,20 +226,20 @@ define i32 @reduction_mix(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
@@ -551,7 +551,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
@@ -559,7 +559,7 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP4]], <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP5]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP4]], <4 x float> [[WIDE_LOAD1]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
@@ -818,9 +818,9 @@ define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
+; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
+; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
@@ -941,21 +941,21 @@ define i32 @reduction_predicated(i32* noalias nocapture %A, i32* noalias nocaptu
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
 ; CHECK:       middle.block:

diff  --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 8b3a3d497d7d6..88563e583a19b 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -170,8 +170,8 @@ define void @loop2(float* %A, float* %B, i32* %C, float %x) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !13, !noalias !15
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD15]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP7]], <4 x float> [[TMP10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP4]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD15]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], <4 x float>* [[TMP11]], align 4, !alias.scope !13, !noalias !15
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4