[llvm] 7344f3d - [LoopVectorize] Add strict in-order reduction support for fixed-width vectorization

Kerry McLaughlin via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 6 06:46:36 PDT 2021


Author: Kerry McLaughlin
Date: 2021-04-06T14:45:34+01:00
New Revision: 7344f3d39a0dc934b22ee43b4f92160fc3ea2db6

URL: https://github.com/llvm/llvm-project/commit/7344f3d39a0dc934b22ee43b4f92160fc3ea2db6
DIFF: https://github.com/llvm/llvm-project/commit/7344f3d39a0dc934b22ee43b4f92160fc3ea2db6.diff

LOG: [LoopVectorize] Add strict in-order reduction support for fixed-width vectorization

Previously we could only vectorize FP reductions if fast math was enabled, as this allows us to
reorder FP operations. However, it may still be beneficial to vectorize the loop by moving
the reduction inside the vectorized loop and making sure that the scalar reduction value
is an input to the horizontal reduction, e.g.:

  %phi = phi float [ 0.0, %entry ], [ %reduction, %vector_body ]
  %load = load <8 x float>
  %reduction = call float @llvm.vector.reduce.fadd.v8f32(float %phi, <8 x float> %load)
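
The corresponding scalar loop keeps the running sum in a PHI and accumulates one
element per iteration (a minimal sketch, matching the pattern in the new test):

  %sum.07 = phi float [ 0.0, %entry ], [ %add, %for.body ]
  %load = load float, float* %arrayidx
  %add = fadd float %load, %sum.07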

This patch adds a new flag (IsOrdered) to RecurrenceDescriptor and reuses as much as possible
the changes added by D75069, which already taught the vectorizer about in-loop reductions.
For now, in-order reduction support is off by default and is controlled with the `-enable-strict-reductions` flag.
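
The new AArch64 test below exercises the feature with an invocation along the lines of:

  opt < %s -loop-vectorize -instcombine -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S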

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D98435

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll

Modified: 
    llvm/include/llvm/Analysis/IVDescriptors.h
    llvm/include/llvm/Transforms/Utils/LoopUtils.h
    llvm/lib/Analysis/IVDescriptors.cpp
    llvm/lib/Transforms/Utils/LoopUtils.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 842a19f47671..41086e4abe61 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -69,9 +69,11 @@ class RecurrenceDescriptor {
 
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
-                       bool Signed, SmallPtrSetImpl<Instruction *> &CI)
+                       bool Signed, bool Ordered,
+                       SmallPtrSetImpl<Instruction *> &CI)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
-        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed) {
+        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
+        IsOrdered(Ordered) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -228,6 +230,9 @@ class RecurrenceDescriptor {
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }
 
+  /// Expose an ordered FP reduction to the instance users.
+  bool isOrdered() const { return IsOrdered; }
+
   /// Attempts to find a chain of operations from Phi to LoopExitInst that can
   /// be treated as a set of reductions instructions for in-loop reductions.
   SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
@@ -250,6 +255,10 @@ class RecurrenceDescriptor {
   Type *RecurrenceType = nullptr;
   // True if all source operands of the recurrence are SExtInsts.
   bool IsSigned = false;
+  // True if this recurrence can be treated as an in-order reduction.
+  // Currently only a non-reassociative FAdd can be considered in-order,
+  // if it is also the only FAdd in the PHI's use chain.
+  bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
 };

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index bf12941e5f4e..067da68bd8de 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -389,6 +389,11 @@ Value *createSimpleTargetReduction(IRBuilderBase &B,
 Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
                              RecurrenceDescriptor &Desc, Value *Src);
 
+/// Create an ordered reduction intrinsic using the given recurrence
+/// descriptor \p Desc.
+Value *createOrderedReduction(IRBuilderBase &B, RecurrenceDescriptor &Desc,
+                              Value *Src, Value *Start);
+
 /// Get the intersection (logical and) of all of the potential IR flags
 /// of each scalar operation (VL) that will be converted into a vector (I).
 /// If OpValue is non-null, we only consider operations similar to OpValue

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 8eaa6ef14596..3427150288b8 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -189,6 +189,32 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
   }
 }
 
+// Check if a given Phi node can be recognized as an ordered reduction for
+// vectorizing floating point operations without unsafe math.
+static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
+                                  Instruction *Exit, PHINode *Phi) {
+  // Currently only FAdd is supported
+  if (Kind != RecurKind::FAdd)
+    return false;
+
+  bool IsOrdered =
+      Exit->getOpcode() == Instruction::FAdd && Exit == ExactFPMathInst;
+
+  // The only pattern accepted is the one in which the reduction PHI
+  // is used as one of the operands of the exit instruction
+  auto *LHS = Exit->getOperand(0);
+  auto *RHS = Exit->getOperand(1);
+  IsOrdered &= ((LHS == Phi) || (RHS == Phi));
+
+  if (!IsOrdered)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi
+                    << ", ExitInst: " << *Exit << "\n");
+
+  return true;
+}
+
 bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
                                            Loop *TheLoop, FastMathFlags FuncFMF,
                                            RecurrenceDescriptor &RedDes,
@@ -416,6 +442,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
+  const bool IsOrdered = checkOrderedReduction(
+      Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+
   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
     // looked through an 'and' instruction when evaluating a potential
@@ -470,7 +499,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, CastInsts);
+                          IsSigned, IsOrdered, CastInsts);
   RedDes = RD;
 
   return true;
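
For illustration, checkOrderedReduction accepts an exit instruction of the form

  %add = fadd float %load, %sum.07

where the reduction PHI (%sum.07) is one of its operands, but it rejects the
fadd_multiple case in the new test, where the PHI reaches the exit instruction
only through an intermediate fadd and the additions therefore cannot be kept
in order.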

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index d2482fe60136..7f1bbeb34d14 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1070,6 +1070,17 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
   return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
 }
 
+Value *llvm::createOrderedReduction(IRBuilderBase &B,
+                                    RecurrenceDescriptor &Desc, Value *Src,
+                                    Value *Start) {
+  auto Kind = Desc.getRecurrenceKind();
+  assert(Kind == RecurKind::FAdd && "Unexpected reduction kind");
+  assert(Src->getType()->isVectorTy() && "Expected a vector type");
+  assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+
+  return B.CreateFAddReduce(Start, Src);
+}
+
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
   auto *VecOp = dyn_cast<Instruction>(I);
   if (!VecOp)
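
The helper emits the in-order reduction intrinsic, threading the scalar chain
value through the start operand, e.g. for an 8 x float source:

  %rdx = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %vec)

Without reassociation fast-math flags this intrinsic performs a strictly ordered,
sequential fadd, which is what makes the transformation legal without fast math.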

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 57f1b4e145c3..31efee728ddc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -330,6 +330,11 @@ static cl::opt<bool>
                            cl::desc("Prefer in-loop vector reductions, "
                                     "overriding the targets preference."));
 
+cl::opt<bool> EnableStrictReductions(
+    "enable-strict-reductions", cl::init(false), cl::Hidden,
+    cl::desc("Enable the vectorisation of loops with in-order (strict) "
+             "FP reductions"));
+
 static cl::opt<bool> PreferPredicatedReductionSelect(
     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
     cl::desc(
@@ -4259,6 +4264,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
 }
 
+static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+  return EnableStrictReductions && RdxDesc.isOrdered();
+}
+
 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   // Get it's reduction variable descriptor.
   assert(Legal->isReductionVariable(Phi) &&
@@ -4288,6 +4297,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   for (unsigned Part = 0; Part < UF; ++Part) {
     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
+    if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc) &&
+        State.VF.isVector())
+      Val = State.get(State.Plan->getVPValue(LoopVal), UF - 1);
     cast<PHINode>(VecRdxPhi)
       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
   }
@@ -4379,7 +4391,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   // terminate on this line. This is the easiest way to ensure we don't
   // accidentally cause an extra step back into the loop while debugging.
   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
-  {
+  if (IsInLoopReductionPhi && useOrderedReductions(RdxDesc))
+    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+  else {
     // Floating-point operations should have some FMF to enable the reduction.
     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
@@ -6078,7 +6092,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
         if (!Legal->isReductionVariable(PN))
           continue;
         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
-        if (PreferInLoopReductions ||
+        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                       RdxDesc.getRecurrenceType(),
                                       TargetTransformInfo::ReductionFlags()))
@@ -7657,7 +7671,7 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
     // If the target would prefer this reduction to happen "in-loop", then we
     // want to record it as such.
     unsigned Opcode = RdxDesc.getOpcode();
-    if (!PreferInLoopReductions &&
+    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                    TargetTransformInfo::ReductionFlags()))
       continue;
@@ -9200,8 +9214,10 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 
 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
+  Value *PrevInChain = State.get(getChainOp(), 0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     RecurKind Kind = RdxDesc->getRecurrenceKind();
+    bool IsOrdered = useOrderedReductions(*RdxDesc);
     Value *NewVecOp = State.get(getVecOp(), Part);
     if (VPValue *Cond = getCondOp()) {
       Value *NewCond = State.get(Cond, Part);
@@ -9213,15 +9229,23 @@ void VPReductionRecipe::execute(VPTransformState &State) {
       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
       NewVecOp = Select;
     }
-    Value *NewRed =
-        createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
-    Value *PrevInChain = State.get(getChainOp(), Part);
+    Value *NewRed;
     Value *NextInChain;
+    if (IsOrdered) {
+      NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+                                      PrevInChain);
+      PrevInChain = NewRed;
+    } else {
+      PrevInChain = State.get(getChainOp(), Part);
+      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+    }
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
       NextInChain =
           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                          NewRed, PrevInChain);
-    } else {
+    } else if (IsOrdered)
+      NextInChain = NewRed;
+    else {
       NextInChain = State.Builder.CreateBinOp(
           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
           PrevInChain);
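
With an unroll factor greater than one, PrevInChain serializes the parts: each
part's ordered reduction takes the previous part's scalar result as its start
value, as checked by the fadd_strict_unroll test below, e.g.:

  %rdx1 = call float @llvm.vector.reduce.fadd.v8f32(float %phi, <8 x float> %load1)
  %rdx2 = call float @llvm.vector.reduce.fadd.v8f32(float %rdx1, <8 x float> %load2)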

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
new file mode 100644
index 000000000000..ed16349c14bb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -0,0 +1,280 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions -S | FileCheck %s -check-prefix=CHECK
+
+define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK: %[[RDX4:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %add
+}
+
+define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_interleave
+; CHECK: entry
+; CHECK: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK: %[[LOAD1:.*]] = load float, float* %a
+; CHECK: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK: for.end
+; CHECK: ret void
+entry:
+  %arrayidxa = getelementptr inbounds float, float* %a, i64 1
+  %a1 = load float, float* %a, align 4
+  %a2 = load float, float* %arrayidxa, align 4
+  br label %for.body
+
+for.body:
+  %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
+  %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidxb1 = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidxb1, align 4
+  %add1 = fadd float %0, %add.phi2
+  %or = or i64 %iv, 1
+  %arrayidxb2 = getelementptr inbounds float, float* %b, i64 %or
+  %1 = load float, float* %arrayidxb2, align 4
+  %add2 = fadd float %1, %add.phi1
+  %iv.next = add nuw nsw i64 %iv, 2
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  store float %add1, float* %a, align 4
+  store float %add2, float* %arrayidxa, align 4
+  ret void
+}
+
+define float @fadd_invariant(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_invariant
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK: for.end.loopexit
+; CHECK: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK: ret float %[[PHI]]
+entry:
+  %arrayidx = getelementptr inbounds float, float* %a, i64 1
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 5.000000e-01
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                      ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %b, i64 %iv
+  %2 = load float, float* %arrayidx4, align 4
+  %add = fadd float %1, %2
+  %rdx = fadd float %res.014, %add
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                 ; preds = %for.body, %entry
+  %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  ret float %res
+}
+
+define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_conditional
+; CHECK: vector.body:
+; CHECK: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK: pred.load.continue6
+; CHECK: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PHI1]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK: for.body
+; CHECK: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK: if.then
+; CHECK: %[[LOAD3:.*]] = load float, float*
+; CHECK: br label %for.inc
+; CHECK: for.inc
+; CHECK: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK: for.end
+; CHECK: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RDX_PHI]]
+entry:
+  br label %for.body
+
+for.body:                                      ; preds = %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %res = phi float [ 1.000000e+00, %entry ], [ %fadd, %for.inc ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %tobool = fcmp une float %0, 0.000000e+00
+  br i1 %tobool, label %if.then, label %for.inc
+
+if.then:                                      ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:
+  %phi = phi float [ %1, %if.then ], [ 3.000000e+00, %for.body ]
+  %fadd = fadd float %res, %phi
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  %rdx = phi float [ %fadd, %for.inc ]
+  ret float %rdx
+}
+
+; Test to check masking is correct, using the "llvm.loop.vectorize.predicate.enable" attribute
+define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @fadd_predicated
+; CHECK: vector.ph
+; CHECK: %[[TRIP_MINUS_ONE:.*]] = add i64 %n, -1
+; CHECK: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK: vector.body
+; CHECK: %[[RDX_PHI:.*]] =  phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK: pred.load.continue2
+; CHECK: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK: for.end:
+; CHECK: %[[RES_PHI:.*]] = phi float [ undef, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RES_PHI]]
+entry:
+  br label %for.body
+
+for.body:                                           ; preds = %entry, %for.body
+  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %l7, %for.body ], [ 0.000000e+00, %entry ]
+  %l2 = getelementptr inbounds float, float* %a, i64 %iv
+  %l3 = load float, float* %l2, align 4
+  %l7 = fadd float %sum.02, %l3
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:                                            ; preds = %for.body
+  %sum.0.lcssa = phi float [ %l7, %for.body ]
+  ret float %sum.0.lcssa
+}
+
+; Negative test - loop contains multiple fadds which we cannot safely reorder
+define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-LABEL: @fadd_multiple
+; CHECK: vector.body
+; CHECK: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK: middle.block
+; CHECK: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK: for.body
+; CHECK: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK: %[[LOAD1:.*]] = load float, float*
+; CHECK: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK: %[[LOAD2:.*]] = load float, float*
+; CHECK: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK: for.end
+; CHECK: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[RET]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %sum, %0
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add3 = fadd float %add, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                         ; preds = %for.body
+  %rdx = phi float [ %add3, %for.body ]
+  ret float %rdx
+}
+
+!0 = distinct !{!0, !4, !7, !9}
+!1 = distinct !{!1, !4, !8, !9}
+!2 = distinct !{!2, !5, !7, !9}
+!3 = distinct !{!3, !6, !7, !9, !10}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.width", i32 2}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
+!8 = !{!"llvm.loop.interleave.count", i32 4}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}


        

