[llvm] [VPlan] Use ResumePhi to create reduction resume phis. (PR #110004)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 28 07:28:09 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/110004

>From d3614bcbf77ac92b1f1dd37711392f828bd695f3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 25 Sep 2024 12:20:07 +0100
Subject: [PATCH 1/5] [VPlan] Use ResumePhi to create reduction resume phis.

Use VPInstruction::ResumePhi to create phi nodes for reduction resume
values.

This allows simplifying createAndCollectMergePhiForReduction to only
collect reduction resume phis when vectorizing epilogue loops and adding
extra incoming edges from the main vector loop.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 82 +++++++++----------
 .../RISCV/vplan-vp-intrinsics-reduction.ll    |  9 ++
 ...-order-recurrence-sink-replicate-region.ll |  2 +
 .../LoopVectorize/vplan-printing.ll           |  9 ++
 4 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index db650b23e271e2..74104304301a86 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7467,23 +7467,31 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
 }
 
 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
-// create a merge phi node for it.
-static void createAndCollectMergePhiForReduction(
-    VPInstruction *RedResult,
-    VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
-    bool VectorizingEpilogue) {
+// create a merge phi node for it and add incoming values from the main vector
+// loop.
+static void updateAndCollectMergePhiForReductionForEpilogueVectorization(
+    VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop,
+    BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) {
   if (!RedResult ||
       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
 
+  using namespace VPlanPatternMatch;
+  VPValue *ResumePhiVPV =
+      cast<VPInstruction>(*find_if(RedResult->users(), [](VPUser *U) {
+        return match(U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(),
+                                                                  m_VPValue()));
+      }));
+  auto *BCBlockPhi = cast<PHINode>(State.get(ResumePhiVPV, true));
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
+  if (!VectorizingEpilogue)
+    return;
 
-  Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane()));
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
-  if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
-                                 RdxDesc.getRecurrenceKind())) {
+  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
+          RdxDesc.getRecurrenceKind())) {
     auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
     assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
@@ -7493,40 +7501,15 @@ static void createAndCollectMergePhiForReduction(
          "when vectorizing the epilogue loop, we need a resume phi from main "
          "vector loop");
 
-  // TODO: bc.merge.rdx should not be created here, instead it should be
-  // modeled in VPlan.
   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  // Create a phi node that merges control-flow from the backedge-taken check
-  // block and the middle block.
-  auto *BCBlockPhi =
-      PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
-                      LoopScalarPreHeader->getTerminator()->getIterator());
-
   // If we are fixing reductions in the epilogue loop then we should already
   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
   // we carry over the incoming values correctly.
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (Incoming == LoopMiddleBlock)
-      BCBlockPhi->addIncoming(FinalValue, Incoming);
-    else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
-                              Incoming);
-    else
-      BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
+    if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+      BCBlockPhi->setIncomingValueForBlock(
+          Incoming, ResumePhi->getIncomingValueForBlock(Incoming));
   }
-
-  auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
-  // TODO: This fixup should instead be modeled in VPlan.
-  // Fix the scalar loop reduction variable with the incoming reduction sum
-  // from the vector body and from the backedge value.
-  int IncomingEdgeBlockIdx =
-      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-  // Pick the other block.
-  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
 }
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7617,11 +7600,12 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  for (VPRecipeBase &R : *ExitVPBB) {
-    createAndCollectMergePhiForReduction(
-        dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
-  }
+  if (IsEpilogueVectorization)
+    for (VPRecipeBase &R : *ExitVPBB) {
+      updateAndCollectMergePhiForReductionForEpilogueVectorization(
+          dyn_cast<VPInstruction>(&R), State, OrigLoop,
+          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+    }
 
   // 2.6. Maintain Loop Hints
   // Keep all loop hints from the original loop on the vector loop (we'll
@@ -9411,6 +9395,22 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         });
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 
+    VPBasicBlock *ScalarPHVPBB = nullptr;
+    if (MiddleVPBB->getNumSuccessors() == 2) {
+      // Order is strict: first is the exit block, second is the scalar
+      // preheader.
+      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+    } else {
+      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+    }
+
+    VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+    auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+        VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
+        {}, "bc.merge.rdx");
+    auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
+    Plan->addLiveOut(RedPhi, ResumePhiRecipe);
+
     // Adjust AnyOf reductions; replace the reduction phi for the selected value
     // with a boolean reduction phi node to check if the condition is true in
     // any iteration. The final value is selected by the final
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
index 90c209cf3f5186..6a435709aeb2b2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll
@@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: No successors
 ; IF-EVL-INLOOP-EMPTY:
 ; IF-EVL-INLOOP-NEXT: scalar.ph:
+; IF-EVL-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; IF-EVL-INLOOP-NEXT: No successors
+; IF-EVL-INLOOP-EMPTY:
+; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; IF-EVL-INLOOP-NEXT: }
 ;
 
@@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-OUTLOOP-NEXT: No successors
 ; NO-VP-OUTLOOP-EMPTY:
 ; NO-VP-OUTLOOP-NEXT: scalar.ph:
+; NO-VP-OUTLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-OUTLOOP-NEXT: No successors
+; NO-VP-OUTLOOP-EMPTY:
+; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-OUTLOOP-NEXT: }
 ;
 
@@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
 ; NO-VP-INLOOP-NEXT: No successors
 ; NO-VP-INLOOP-EMPTY:
 ; NO-VP-INLOOP-NEXT: scalar.ph:
+; NO-VP-INLOOP-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
 ; NO-VP-INLOOP-NEXT: No successors
+; NO-VP-INLOOP-EMPTY:
+; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
 ; NO-VP-INLOOP-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 8e56614a2e3d5c..b05980bef1b38f 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
 ; CHECK-NEXT:   EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
+; CHECK-NEXT:   EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
+; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]>
 ; CHECK-NEXT: }
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 0dde507d08be74..2247295295663e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
 ; CHECK-NEXT: }
 ;
 entry:
@@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT: scalar.ph
+; CHECK-NEXT:   EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
 ; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]>
 ; CHECK-NEXT:}
 
 entry:

>From 8337aa62e52ae7ca82cd9ecef63dd0ed7bc4aadf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 13 Oct 2024 21:58:06 +0100
Subject: [PATCH 2/5] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d11e63fe68579e..0cd40b2faf4042 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7494,12 +7494,12 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
   }
 }
 
-// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
-// create a merge phi node for it and add incoming values from the main vector
-// loop.
-static void updateAndCollectMergePhiForReductionForEpilogueVectorization(
+// If \p RedResult is a ComputeReductionResult when vectorizing the epilog loop,
+// update the reduction's scalar PHI node by adding the incoming value from the
+// main vector loop.
+static void updateMergePhiForReductionForEpilogueVectorization(
     VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop,
-    BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) {
+    BasicBlock *LoopMiddleBlock) {
   if (!RedResult ||
       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
@@ -7510,12 +7510,11 @@ static void updateAndCollectMergePhiForReductionForEpilogueVectorization(
         return match(U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(),
                                                                   m_VPValue()));
       }));
+  assert(ResumePhiVPV->getNumUsers() == 1 &&
+         "ResumePhi must have a single user");
   auto *BCBlockPhi = cast<PHINode>(State.get(ResumePhiVPV, true));
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-  if (!VectorizingEpilogue)
-    return;
-
   auto *ResumePhi =
       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
@@ -7525,16 +7524,16 @@ static void updateAndCollectMergePhiForReductionForEpilogueVectorization(
     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
     ResumePhi = cast<PHINode>(Cmp->getOperand(0));
   }
-  assert((!VectorizingEpilogue || ResumePhi) &&
+  assert(ResumePhi &&
          "when vectorizing the epilogue loop, we need a resume phi from main "
          "vector loop");
 
   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  // If we are fixing reductions in the epilogue loop then we should already
-  // have created a bc.merge.rdx Phi after the main vector body. Ensure that
-  // we carry over the incoming values correctly.
+  // When fixing reductions in the epilogue loop then we should already have
+  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
+  // over the incoming values correctly.
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
+    if (is_contained(ResumePhi->blocks(), Incoming))
       BCBlockPhi->setIncomingValueForBlock(
           Incoming, ResumePhi->getIncomingValueForBlock(Incoming));
   }
@@ -7628,11 +7627,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  if (IsEpilogueVectorization)
+  if (IsEpilogueVectorization && ExpandedSCEVs)
     for (VPRecipeBase &R : *ExitVPBB) {
-      updateAndCollectMergePhiForReductionForEpilogueVectorization(
+      updateMergePhiForReductionForEpilogueVectorization(
           dyn_cast<VPInstruction>(&R), State, OrigLoop,
-          State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+          State.CFG.VPBB2IRBB[ExitVPBB]);
     }
 
   // 2.6. Maintain Loop Hints

>From 3852344e37808aa17d11c6299d1dc4ddf4def1a1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 25 Oct 2024 16:44:36 +0100
Subject: [PATCH 3/5] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 56 ++++++++++---------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 94edef5a2da2fd..86d4a95a3371a0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7561,48 +7561,52 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
   }
 }
 
-// If \p RedResult is a ComputeReductionResult when vectorizing the epilog loop,
+// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
 // update the reduction's scalar PHI node by adding the incoming value from the
 // main vector loop.
 static void updateMergePhiForReductionForEpilogueVectorization(
-    VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop,
+    VPRecipeBase *R, VPTransformState &State, Loop *OrigLoop,
     BasicBlock *LoopMiddleBlock) {
+  auto *RedResult = dyn_cast<VPInstruction>(R);
   if (!RedResult ||
       RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
 
-  using namespace VPlanPatternMatch;
-  VPValue *ResumePhiVPV =
-      cast<VPInstruction>(*find_if(RedResult->users(), [](VPUser *U) {
-        return match(U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(),
-                                                                  m_VPValue()));
-      }));
-  assert(ResumePhiVPV->getNumUsers() == 1 &&
-         "ResumePhi must have a single user");
-  auto *BCBlockPhi = cast<PHINode>(State.get(ResumePhiVPV, true));
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-  auto *ResumePhi =
-      dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+  PHINode *MainResumePhi;
   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
           RdxDesc.getRecurrenceKind())) {
     auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
     assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
     assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
-    ResumePhi = cast<PHINode>(Cmp->getOperand(0));
+    MainResumePhi = cast<PHINode>(Cmp->getOperand(0));
+  } else {
+    MainResumePhi = cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
   }
-  assert(ResumePhi &&
-         "when vectorizing the epilogue loop, we need a resume phi from main "
-         "vector loop");
 
-  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  // When fixing reductions in the epilogue loop then we should already have
+  // When fixing reductions in the epilogue loop we should already have
   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
   // over the incoming values correctly.
+  using namespace VPlanPatternMatch;
+  auto IsResumePhi = [](VPUser *U) {
+    return match(
+        U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
+  };
+  auto *EpiResumePhiVPI =
+      cast<VPInstruction>(*find_if(RedResult->users(), IsResumePhi));
+  assert(count_if(RedResult->users(), IsResumePhi) == 1 &&
+         "ResumePhi must have a single user");
+  auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
+  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
-    if (is_contained(ResumePhi->blocks(), Incoming))
-      BCBlockPhi->setIncomingValueForBlock(
-          Incoming, ResumePhi->getIncomingValueForBlock(Incoming));
+    if (is_contained(MainResumePhi->blocks(), Incoming)) {
+      assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
+                 RdxDesc.getRecurrenceStartValue() &&
+             "Trying to reset unexpected value");
+      EpiResumePhi->setIncomingValueForBlock(
+          Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
+    }
   }
 }
 
@@ -7617,7 +7621,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   assert(
       (IsEpilogueVectorization || !ExpandedSCEVs) &&
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
-  (void)IsEpilogueVectorization;
 
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
@@ -7694,11 +7697,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // 2.5 Collect reduction resume values.
   auto *ExitVPBB =
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
-  if (IsEpilogueVectorization && ExpandedSCEVs)
+  if (IsEpilogueVectorization)
     for (VPRecipeBase &R : *ExitVPBB) {
       updateMergePhiForReductionForEpilogueVectorization(
-          dyn_cast<VPInstruction>(&R), State, OrigLoop,
-          State.CFG.VPBB2IRBB[ExitVPBB]);
+          &R, State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB]);
     }
 
   // 2.6. Maintain Loop Hints
@@ -10232,7 +10234,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             *BestMainPlan, MainILV, DT, true);
+                                             *BestMainPlan, MainILV, DT, false);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow

>From 1d34cd8fc1d5b4e00813bbfc04db9a8f51b71f27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 27 Oct 2024 21:06:00 +0100
Subject: [PATCH 4/5] !fixup address latest comments, thanks!

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 58 ++++++++++---------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8da72f21a8a0e0..b61699cf7153e0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7563,28 +7563,33 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
 }
 
 // If \p R is a ComputeReductionResult when vectorizing the epilog loop,
-// update the reduction's scalar PHI node by adding the incoming value from the
+// fix the reduction's scalar PHI node by adding the incoming value from the
 // main vector loop.
-static void updateMergePhiForReductionForEpilogueVectorization(
+static void fixReductionScalarResumeWhenVectorizingEpilog(
     VPRecipeBase *R, VPTransformState &State, Loop *OrigLoop,
     BasicBlock *LoopMiddleBlock) {
-  auto *RedResult = dyn_cast<VPInstruction>(R);
-  if (!RedResult ||
-      RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
+  auto *EpiRedResult = dyn_cast<VPInstruction>(R);
+  if (!EpiRedResult ||
+      EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
     return;
 
-  auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
-  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-  PHINode *MainResumePhi;
+  auto *EpiRedHeaderPhi =
+      cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
+  const RecurrenceDescriptor &RdxDesc =
+      EpiRedHeaderPhi->getRecurrenceDescriptor();
+  Value *MainResumeValue =
+      EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
   if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
           RdxDesc.getRecurrenceKind())) {
-    auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
-    assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
-    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
-    MainResumePhi = cast<PHINode>(Cmp->getOperand(0));
-  } else {
-    MainResumePhi = cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+    auto *Cmp = cast<ICmpInst>(MainResumeValue);
+    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
+           "AnyOf expected to start with ICMP_NE");
+    assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
+           "AnyOf expected to start by comparing main resume value to original "
+           "start value");
+    MainResumeValue = Cmp->getOperand(0);
   }
+  PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
 
   // When fixing reductions in the epilogue loop we should already have
   // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
@@ -7594,12 +7599,13 @@ static void updateMergePhiForReductionForEpilogueVectorization(
     return match(
         U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
   };
-  auto *EpiResumePhiVPI =
-      cast<VPInstruction>(*find_if(RedResult->users(), IsResumePhi));
-  assert(count_if(RedResult->users(), IsResumePhi) == 1 &&
+  assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
          "ResumePhi must have a single user");
+  auto *EpiResumePhiVPI =
+      cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
   BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
+  unsigned UpdateCnt = 0;
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
     if (is_contained(MainResumePhi->blocks(), Incoming)) {
       assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
@@ -7607,8 +7613,11 @@ static void updateMergePhiForReductionForEpilogueVectorization(
              "Trying to reset unexpected value");
       EpiResumePhi->setIncomingValueForBlock(
           Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
+      UpdateCnt++;
     }
   }
+  assert(UpdateCnt <= 1 && "Only should update at most 1 incoming value");
+  (void)UpdateCnt;
 }
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7701,7 +7710,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
       cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
   if (VectorizingEpilogue)
     for (VPRecipeBase &R : *ExitVPBB) {
-      updateMergePhiForReductionForEpilogueVectorization(
+      fixReductionScalarResumeWhenVectorizingEpilog(
           &R, State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB]);
     }
 
@@ -9504,15 +9513,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         });
     FinalReductionResult->insertBefore(*MiddleVPBB, IP);
 
-    VPBasicBlock *ScalarPHVPBB = nullptr;
-    if (MiddleVPBB->getNumSuccessors() == 2) {
-      // Order is strict: first is the exit block, second is the scalar
-      // preheader.
-      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
-    } else {
-      ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
-    }
-
+    // Order is strict: if there are multiple successors, the first is the exit
+    // block, second is the scalar preheader.
+    VPBasicBlock *ScalarPHVPBB =
+        cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
     VPBuilder ScalarPHBuilder(ScalarPHVPBB);
     auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
         VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},

>From 60c8fae7436e78e6f84d3e8e1ed13853b2f49fbc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 28 Oct 2024 14:24:48 +0000
Subject: [PATCH 5/5] !fixup address latest comments, thanks!

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b61699cf7153e0..778d928252e051 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7566,8 +7566,7 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
 // fix the reduction's scalar PHI node by adding the incoming value from the
 // main vector loop.
 static void fixReductionScalarResumeWhenVectorizingEpilog(
-    VPRecipeBase *R, VPTransformState &State, Loop *OrigLoop,
-    BasicBlock *LoopMiddleBlock) {
+    VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) {
   auto *EpiRedResult = dyn_cast<VPInstruction>(R);
   if (!EpiRedResult ||
       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
@@ -7604,20 +7603,21 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
   auto *EpiResumePhiVPI =
       cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
   auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
-  BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
-  unsigned UpdateCnt = 0;
+  BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();
+  bool Updated = false;
   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
     if (is_contained(MainResumePhi->blocks(), Incoming)) {
       assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
                  RdxDesc.getRecurrenceStartValue() &&
              "Trying to reset unexpected value");
+      assert(!Updated && "Should update at most 1 incoming value");
       EpiResumePhi->setIncomingValueForBlock(
           Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
-      UpdateCnt++;
+      Updated = true;
     }
   }
-  assert(UpdateCnt <= 1 && "Only should update at most 1 incoming value");
-  (void)UpdateCnt;
+  assert(Updated && "Must update EpiResumePhi.");
+  (void)Updated;
 }
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
@@ -7711,7 +7711,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   if (VectorizingEpilogue)
     for (VPRecipeBase &R : *ExitVPBB) {
       fixReductionScalarResumeWhenVectorizingEpilog(
-          &R, State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB]);
+          &R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
     }
 
   // 2.6. Maintain Loop Hints



More information about the llvm-commits mailing list