[llvm] [VPlan] Introduce ComputeReductionResult VPInstruction opcode. (PR #70253)

Mon Dec 4 06:58:04 PST 2023

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/70253

>From 42d1023e0272718a36a3bcc47d2f1113cbf03fea Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 16 Sep 2023 20:09:00 +0100
Subject: [PATCH] [VPlan] Introduce ComputeReductionResult VPInstruction
 opcode.

This patch introduces a new ComputeReductionResult opcode to compute the
final reduction result in the middle block. The code from fixReduction
has been moved to ComputeReductionResult, after some earlier cleanup
changes to model parts of fixReduction explicitly elsewhere as needed.

The recipe may be broken down further in the future.

Note that the ComputeReductionResult at the moment also creates the phi
node to merge the reduction result from the trip count check and the
middle block, to be used as resume value for the scalar remainder loop.
Once we have a VPValue for the reduction result, this can also be
modeled explicitly and moved out of the recipe.
---
 .../Vectorize/LoopVectorizationPlanner.h      |  11 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 274 ++++++------------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         |   3 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 136 +++++++++
 ...-order-recurrence-sink-replicate-region.ll |   3 +-
 .../LoopVectorize/vplan-printing.ll           |   7 +-
 7 files changed, 237 insertions(+), 198 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 577ce8000de27..a4ead0523a2fb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -347,10 +347,13 @@ class LoopVectorizationPlanner {
   /// vectorization re-using plans for both the main and epilogue vector loops.
   /// It should be removed once the re-use issue has been fixed.
   /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
-  /// to re-use expansion results generated during main plan execution. Returns
-  /// a mapping of SCEVs to their expanded IR values. Note that this is a
-  /// temporary workaround needed due to the current epilogue handling.
-  DenseMap<const SCEV *, Value *>
+  /// to re-use expansion results generated during main plan execution.
+  ///
+  /// Returns a mapping of SCEVs to their expanded IR values and a mapping for
+  /// the reduction resume values. Note that this is a temporary workaround
+  /// needed due to the current epilogue handling.
+  std::pair<DenseMap<const SCEV *, Value *>,
+            DenseMap<const RecurrenceDescriptor *, Value *>>
   executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
               InnerLoopVectorizer &LB, DominatorTree *DT,
               bool IsEpilogueVectorization,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d7a0a29d6ed55..d54192bd7a059 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -584,10 +584,6 @@ class InnerLoopVectorizer {
   /// able to vectorize with strict in-order reductions for the given RdxDesc.
   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
 
-  // Returns the resume value (bc.merge.rdx) for a reduction as
-  // generated by fixReduction.
-  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
-
   /// Create a new phi node for the induction variable \p OrigPhi to resume
   /// iteration count in the scalar epilogue, from where the vectorized loop
   /// left off. \p Step is the SCEV-expanded induction step to use. In cases
@@ -1166,14 +1162,6 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
   }
 }
 
-PHINode *InnerLoopVectorizer::getReductionResumeValue(
-    const RecurrenceDescriptor &RdxDesc) {
-  auto It = ReductionResumeValues.find(&RdxDesc);
-  assert(It != ReductionResumeValues.end() &&
-         "Expected to find a resume value for the reduction.");
-  return It->second;
-}
-
 namespace llvm {
 
 // Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -3513,11 +3501,6 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
   VPBasicBlock *Header =
       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
 
-  for (VPRecipeBase &R : Header->phis()) {
-    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
-      fixReduction(ReductionPhi, State);
-  }
-
   for (VPRecipeBase &R : Header->phis()) {
     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       fixFixedOrderRecurrence(FOR, State);
@@ -3643,169 +3626,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
   Phi->setName("scalar.recur");
 }
 
-void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
-                                       VPTransformState &State) {
-  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
-  // Get it's reduction variable descriptor.
-  assert(Legal->isReductionVariable(OrigPhi) &&
-         "Unable to find the reduction variable");
-  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-
-  RecurKind RK = RdxDesc.getRecurrenceKind();
-  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
-  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-  if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
-    State.setDebugLocFrom(I->getDebugLoc());
-
-  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
-
-  // Before each round, move the insertion point right between
-  // the PHIs and the values we are going to write.
-  // This allows us to write both PHINodes and the extractelement
-  // instructions.
-  Builder.SetInsertPoint(LoopMiddleBlock,
-                         LoopMiddleBlock->getFirstInsertionPt());
-
-  State.setDebugLocFrom(LoopExitInst->getDebugLoc());
-
-  Type *PhiTy = OrigPhi->getType();
-  // If tail is folded by masking, the vector value to leave the loop should be
-  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
-  // instead of the former. For an inloop reduction the reduction will already
-  // be predicated, and does not need to be handled here.
-  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
-    VPValue *Def = nullptr;
-    for (VPUser *U : LoopExitInstDef->users()) {
-      auto *S = dyn_cast<VPInstruction>(U);
-      if (S && S->getOpcode() == Instruction::Select) {
-        Def = S;
-        break;
-      }
-    }
-    if (Def)
-      LoopExitInstDef = Def;
-  }
-
-  VectorParts RdxParts(UF);
-  for (unsigned Part = 0; Part < UF; ++Part)
-    RdxParts[Part] = State.get(LoopExitInstDef, Part);
-
-  // If the vector reduction can be performed in a smaller type, we truncate
-  // then extend the loop exit value to enable InstCombine to evaluate the
-  // entire expression in the smaller type.
-  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
-    Builder.SetInsertPoint(LoopMiddleBlock,
-                           LoopMiddleBlock->getFirstInsertionPt());
-    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
-    }
-  }
-
-  // Reduce all of the unrolled parts into a single vector.
-  Value *ReducedPartRdx = RdxParts[0];
-  unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
-  // The middle block terminator has already been assigned a DebugLoc here (the
-  // OrigLoop's single latch terminator). We want the whole middle block to
-  // appear to execute on this line because: (a) it is all compiler generated,
-  // (b) these instructions are always executed after evaluating the latch
-  // conditional branch, and (c) other passes may add new predecessors which
-  // terminate on this line. This is the easiest way to ensure we don't
-  // accidentally cause an extra step back into the loop while debugging.
-  State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
-  if (PhiR->isOrdered())
-    ReducedPartRdx = RdxParts[UF - 1];
-  else {
-    // Floating-point operations should have some FMF to enable the reduction.
-    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
-    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-    for (unsigned Part = 1; Part < UF; ++Part) {
-      Value *RdxPart = RdxParts[Part];
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        ReducedPartRdx = Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
-        ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
-                                       ReducedPartRdx, RdxPart);
-      else
-        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
-    }
-  }
-
-  // Create the reduction after the loop. Note that inloop reductions create the
-  // target reduction in the loop using a Reduction recipe.
-  if (VF.isVector() && !PhiR->isInLoop()) {
-    ReducedPartRdx =
-        createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
-    // If the reduction can be performed in a smaller type, we need to extend
-    // the reduction to the wider type before we branch to the original loop.
-    if (PhiTy != RdxDesc.getRecurrenceType())
-      ReducedPartRdx = RdxDesc.isSigned()
-                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
-                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
-  }
-
-  PHINode *ResumePhi =
-      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-
-  // Create a phi node that merges control-flow from the backedge-taken check
-  // block and the middle block.
-  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
-                                        LoopScalarPreHeader->getTerminator());
-
-  // If we are fixing reductions in the epilogue loop then we should already
-  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
-  // we carry over the incoming values correctly.
-  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (Incoming == LoopMiddleBlock)
-      BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
-    else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
-                              Incoming);
-    else
-      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
-  }
-
-  // Set the resume value for this reduction
-  ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
-
-  // If there were stores of the reduction value to a uniform memory address
-  // inside the loop, create the final store here.
-  if (StoreInst *SI = RdxDesc.IntermediateStore) {
-    StoreInst *NewSI =
-        Builder.CreateAlignedStore(ReducedPartRdx, SI->getPointerOperand(),
-                                   SI->getAlign());
-    propagateMetadata(NewSI, SI);
-
-    // If the reduction value is used in other places,
-    // then let the code below create PHI's for that.
-  }
-
-  // Now, we need to fix the users of the reduction variable
-  // inside and outside of the scalar remainder loop.
-
-  // We know that the loop is in LCSSA form. We need to update the PHI nodes
-  // in the exit blocks.  See comment on analogous loop in
-  // fixFixedOrderRecurrence for a more complete explaination of the logic.
-  if (!Cost->requiresScalarEpilogue(VF.isVector()))
-    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
-      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
-        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
-        State.Plan->removeLiveOut(&LCSSAPhi);
-      }
-
-  // Fix the scalar loop reduction variable with the incoming reduction sum
-  // from the vector body and from the backedge value.
-  int IncomingEdgeBlockIdx =
-      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-  // Pick the other block.
-  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
   // The basic block and loop containing the predicated instruction.
   auto *PredBB = PredInst->getParent();
@@ -7561,7 +7381,9 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
   }
 }
 
-SCEV2ValueTy LoopVectorizationPlanner::executePlan(
+std::pair<DenseMap<const SCEV *, Value *>,
+          DenseMap<const RecurrenceDescriptor *, Value *>>
+LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
@@ -7590,6 +7412,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
     State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
     BestVPlan.getPreheader()->execute(&State);
   }
+  State.CFG.VPBB2IRBB[BestVPlan.getPreheader()] = OrigLoop->getLoopPreheader();
   if (!ILV.getTripCount())
     ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
   else
@@ -7640,6 +7463,46 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
 
   BestVPlan.execute(&State);
 
+  DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
+  BasicBlock *OrigScalarPH = OrigLoop->getLoopPreheader();
+  // Check if ResResult is a CompueReductionResult instruction, and if it is
+  // return the merge phi node for it.
+  auto GetMergePhiForReduction = [&State,
+                                  OrigScalarPH](VPInstruction *RedResult)
+      -> std::pair<const RecurrenceDescriptor *, PHINode *> {
+    if (!RedResult ||
+        RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+      return {nullptr, nullptr};
+
+    Value *FinalValue =
+        State.get(RedResult, VPIteration(State.UF - 1,
+                                         VPLane::getLastLaneForVF(State.VF)));
+    auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
+    assert(FinalValue->hasOneUse() || PhiR->isInLoop() ||
+           PhiR->getRecurrenceDescriptor().IntermediateStore &&
+               "unexpected final value");
+
+    // Find the merge phi, which must be located in the preheader of the
+    // original scalar loop.
+    PHINode *MergePhi = nullptr;
+    for (Value *U : FinalValue->users()) {
+      auto *P = dyn_cast<PHINode>(U);
+      if (P && OrigScalarPH == P->getParent()) {
+        MergePhi = P;
+        break;
+      }
+    }
+    assert(MergePhi && "must have merge phi");
+    return {&PhiR->getRecurrenceDescriptor(), MergePhi};
+  };
+  for (VPRecipeBase &R : *cast<VPBasicBlock>(
+           BestVPlan.getVectorLoopRegion()->getSingleSuccessor())) {
+    const auto &[RedRdx, MergePhi] =
+        GetMergePhiForReduction(dyn_cast<VPInstruction>(&R));
+    if (!RedRdx)
+      continue;
+    ReductionResumeValues[RedRdx] = MergePhi;
+  }
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
   MDNode *OrigLoopID = OrigLoop->getLoopID();
@@ -7673,7 +7536,7 @@ SCEV2ValueTy LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  return State.ExpandedSCEVs;
+  return {State.ExpandedSCEVs, ReductionResumeValues};
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -8949,7 +8812,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 void LoopVectorizationPlanner::adjustRecipesForReductions(
     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
     ElementCount MinVF) {
-  VPBasicBlock *Header = Plan->getVectorLoopRegion()->getEntryBasicBlock();
+  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
+  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
   // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
   // sank outside of the loop would keep the same order as they had in the
   // original loop.
@@ -8992,7 +8856,25 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   SmallVector<VPReductionPHIRecipe *> InLoopReductionPhis;
   for (VPRecipeBase &R : Header->phis()) {
     auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
-    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
+    if (!PhiR)
+      continue;
+
+    // TODO: At the moment ComputeReductionResult also creates the bc.merge.rdx
+    // phi nodes, hence it needs to be created unconditionally here, until the
+    // reduction resume value handling is also modeled in VPlan.
+    auto *FinalReductionResult =
+        new VPInstruction(VPInstruction::ComputeReductionResult,
+                          {PhiR, PhiR->getBackedgeValue()});
+    cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
+        ->appendRecipe(FinalReductionResult);
+    for (VPUser *U : to_vector(PhiR->getBackedgeValue()->users())) {
+      auto *LiveOut = dyn_cast<VPLiveOut>(U);
+      if (!LiveOut)
+        continue;
+      LiveOut->setOperand(0, FinalReductionResult);
+    }
+
+    if (!PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
       continue;
     InLoopReductionPhis.push_back(PhiR);
   }
@@ -9027,6 +8909,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) {
       VPValue *PreviousLinkV = PreviousLink->getVPSingleValue();
 
+      if (CurrentLink->getParent()->getParent() != VectorLoopRegion)
+        continue;
       Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
 
       // Index of the first operand which holds a non-mask vector operand.
@@ -9128,8 +9012,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
               : new VPInstruction(Instruction::Select, {Cond, Red, PhiR});
       Result->insertBefore(&*Builder.getInsertPoint());
       Red->replaceUsesWithIf(
-          Result->getVPSingleValue(),
-          [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
+          Result->getVPSingleValue(), [VectorLoopRegion](VPUser &U, unsigned) {
+            auto *UR = cast<VPRecipeBase>(&U);
+            if (UR->getParent()->getParent() != VectorLoopRegion) {
+              assert(cast<VPInstruction>(UR)->getOpcode() ==
+                         VPInstruction::ComputeReductionResult &&
+                     "user outside of the vector region must be "
+                     "ComputeReductionResult");
+              return true;
+            }
+            return false;
+          });
       if (PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect(
               PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
@@ -10161,8 +10054,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
-        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             BestMainPlan, MainILV, DT, true);
+        const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
+            EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10203,8 +10096,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           Value *ResumeV = nullptr;
           // TODO: Move setting of resume values to prepareToExecute.
           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
-            ResumeV = MainILV.getReductionResumeValue(
-                ReductionPhi->getRecurrenceDescriptor());
+            ResumeV = ReductionResumeValues
+                          .find(&ReductionPhi->getRecurrenceDescriptor())
+                          ->second;
           } else {
             // Create induction resume values for both widened pointer and
             // integer/fp inductions and update the start value of the induction
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 0074532f9c68d..d4ac17d677da6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -456,6 +456,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
     // The Exit block of a loop is always set to be successor 0 of the Exiting
     // block.
     cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
+    State->Builder.SetInsertPoint(NewBB->getFirstNonPHI());
   } else if (PrevVPBB && /* A */
              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
                SingleHPred->getExitingBasicBlock() == PrevVPBB &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 639c2ac685a66..9911246a223a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1034,7 +1034,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    ComputeReductionResult,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f73487ffb6a7e..f2571b4197a3a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>
 
@@ -404,6 +405,138 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::ComputeReductionResult: {
+    if (Part != 0)
+      return State.get(
+          this, VPIteration(State.UF - 1, VPLane::getLastLaneForVF(State.VF)));
+
+    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(getOperand(0));
+    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
+    // Get it's reduction variable descriptor.
+    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+
+    BasicBlock *LoopMiddleBlock = State.CFG.VPBB2IRBB[getParent()];
+    Builder.SetInsertPoint(LoopMiddleBlock,
+                           LoopMiddleBlock->getFirstInsertionPt());
+
+    RecurKind RK = RdxDesc.getRecurrenceKind();
+    TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+    Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+    if (auto *I = dyn_cast<Instruction>(&*ReductionStartValue))
+      State.setDebugLocFrom(I->getDebugLoc());
+
+    VPValue *LoopExitInstDef = getOperand(1);
+    // This is the vector-clone of the value that leaves the loop.
+    // State.setDebugLocFrom(LoopExitInst->getDebugLoc());
+
+    Type *PhiTy = OrigPhi->getType();
+    VectorParts RdxParts(State.UF);
+    for (unsigned Part = 0; Part < State.UF; ++Part)
+      RdxParts[Part] = State.get(LoopExitInstDef, Part);
+
+    // If the vector reduction can be performed in a smaller type, we truncate
+    // then extend the loop exit value to enable InstCombine to evaluate the
+    // entire expression in the smaller type.
+    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
+      Builder.SetInsertPoint(LoopMiddleBlock,
+                             LoopMiddleBlock->getFirstInsertionPt());
+      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
+      for (unsigned Part = 0; Part < State.UF; ++Part)
+        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+    }
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = RecurrenceDescriptor::getOpcode(RK);
+
+    // The middle block terminator has already been assigned a DebugLoc here
+    // (the OrigLoop's single latch terminator). We want the whole middle block
+    // to appear to execute on this line because: (a) it is all compiler
+    // generated, (b) these instructions are always executed after evaluating
+    // the latch conditional branch, and (c) other passes may add new
+    // predecessors which terminate on this line. This is the easiest way to
+    // ensure we don't accidentally cause an extra step back into the loop while
+    // debugging.
+    State.setDebugLocFrom(LoopMiddleBlock->getTerminator()->getDebugLoc());
+    if (PhiR->isOrdered()) {
+      ReducedPartRdx = RdxParts[State.UF - 1];
+    } else {
+      // Floating-point operations should have some FMF to enable the reduction.
+      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+      for (unsigned Part = 1; Part < State.UF; ++Part) {
+        Value *RdxPart = RdxParts[Part];
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+          ReducedPartRdx = Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
+          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+                                         ReducedPartRdx, RdxPart);
+        else
+          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+      }
+    }
+
+    // Create the reduction after the loop. Note that inloop reductions create
+    // the target reduction in the loop using a Reduction recipe.
+    if (State.VF.isVector() && !PhiR->isInLoop()) {
+      ReducedPartRdx =
+          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
+      // If the reduction can be performed in a smaller type, we need to extend
+      // the reduction to the wider type before we branch to the original loop.
+      if (PhiTy != RdxDesc.getRecurrenceType())
+        ReducedPartRdx = RdxDesc.isSigned()
+                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
+                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
+    }
+
+    auto *ResumePhi =
+        dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
+    auto *OrigLoop = State.LI->getLoopFor(OrigPhi->getParent());
+    // TODO: bc.merge.rdx should not be created here, instead it should be
+    // modeled in VPlan.
+    BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+    // Create a phi node that merges control-flow from the backedge-taken check
+    // block and the middle block.
+    auto *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
+                                       LoopScalarPreHeader->getTerminator());
+
+    // If we are fixing reductions in the epilogue loop then we should already
+    // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+    // we carry over the incoming values correctly.
+    for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+      if (Incoming == LoopMiddleBlock)
+        BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
+      else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+        BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+                                Incoming);
+      else
+        BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+    }
+
+    // If there were stores of the reduction value to a uniform memory address
+    // inside the loop, create the final store here.
+    if (StoreInst *SI = RdxDesc.IntermediateStore) {
+      auto *NewSI = Builder.CreateAlignedStore(
+          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
+      propagateMetadata(NewSI, SI);
+    }
+
+    // TODO: This fixup should instead be modeled in VPlan.
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int IncomingEdgeBlockIdx =
+        OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+    // Pick the other block.
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+    OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+    OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+
+    State.set(this, ReducedPartRdx,
+              VPIteration(State.UF - 1, VPLane::getLastLaneForVF(State.VF)));
+    return ReducedPartRdx;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -483,6 +616,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
   case VPInstruction::BranchOnCount:
     O << "branch-on-count";
     break;
+  case VPInstruction::ComputeReductionResult:
+    O << "compute-reduction-result";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 3198f19a8f9a9..245dcc63f600e 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -212,9 +212,10 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT:  EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out i32 %res = vp<[[SEL]]>
+; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 3626b8c9dce78..4fb0ca8b8264d 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -129,9 +129,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %red.next.lcssa = ir<%red.next>
+; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -175,6 +176,7 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[RED_RES:.+]]> = compute-reduction-result ir<%red>
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
 ;
@@ -369,9 +371,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
+; CHECK-NEXT:   EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
-; CHECK-NEXT: Live-out float %muladd.lcssa = ir<%muladd>
+; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]>
 ; CHECK-NEXT:}
 
 entry: