[llvm] [VPlan] Update getBestPlan to return VF, use also for epilogue vec. (PR #98821)

Fri Jul 26 04:22:03 PDT 2024

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/98821

>From f9fc443fa579217362d9e28dab6803eabd7be703 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 14 Jul 2024 16:20:41 +0100
Subject: [PATCH 1/4] [VPlan] Update getBestPlan to return VF, use also for
 epilogue vec.

Update getBestPlan to return the VF alongside the best plan instead of
restricting the returned plan's VFs to the best VF.

This allows using getBestPlan to also get the best VPlan for epilogue
vectorization. As the same plan may be used to vectorize both the main
and epilogue loop, restricting the VF of the best plan would cause
issues.
---
 .../Vectorize/LoopVectorizationPlanner.h      |  5 +++--
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 ++++++++-----------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c63cf0c37f2f9..62b5d270e2f04 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -365,8 +365,9 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Return the most profitable plan and fix its VF to the most profitable one.
-  VPlan &getBestPlan() const;
+  /// Return the most profitable vectorization factor together with the most
+  /// profitable plan containing that vectorization factor.
+  std::pair<ElementCount, VPlan &> getBestPlan() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7d37d67cde29c..2ea136c7ebc48 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7423,11 +7423,11 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
   return Cost;
 }
 
-VPlan &LoopVectorizationPlanner::getBestPlan() const {
+std::pair<ElementCount, VPlan &> LoopVectorizationPlanner::getBestPlan() const {
   // If there is a single VPlan with a single VF, return it directly.
   VPlan &FirstPlan = *VPlans[0];
   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
-    return FirstPlan;
+    return {*FirstPlan.vectorFactors().begin(), FirstPlan};
 
   VPlan *BestPlan = &FirstPlan;
   ElementCount ScalarVF = ElementCount::getFixed(1);
@@ -7466,8 +7466,7 @@ VPlan &LoopVectorizationPlanner::getBestPlan() const {
       }
     }
   }
-  BestPlan->setVF(BestFactor.Width);
-  return *BestPlan;
+  return {BestFactor.Width, *BestPlan};
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10287,6 +10286,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
 
+      const auto &[Width, BestPlan] = LVP.getBestPlan();
+      LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width << "\n");
+      assert(VF.Width == Width &&
+             "VPlan cost model and legacy cost model disagreed");
+
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
           LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
@@ -10395,14 +10399,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
 
-        VPlan &BestPlan = LVP.getBestPlan();
-        assert(size(BestPlan.vectorFactors()) == 1 &&
-               "Plan should have a single VF");
-        ElementCount Width = *BestPlan.vectorFactors().begin();
-        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
-                          << "\n");
-        assert(VF.Width == Width &&
-               "VPlan cost model and legacy cost model disagreed");
         LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 

>From 69fd80e097419738608941ad8452273cdacb2fc8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sat, 20 Jul 2024 10:56:01 +0100
Subject: [PATCH 2/4] !fixup update after updating to latest main.

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 32 ++++++++-----------
 .../RISCV/riscv-vector-reverse.ll             |  4 +--
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f472aee34ef28..873f4b2d7db22 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10320,8 +10320,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                  &CM, BFI, PSI, Checks);
 
-      VPlan &BestPlan =
-          UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width) : LVP.getBestPlan();
+      VPlan &BestPlan = UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width)
+                                           : LVP.getBestPlan().second;
       assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
              "VPlan cost model and legacy cost model disagreed");
       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
@@ -10335,10 +10335,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
 
-      const auto &[Width, BestPlan] = LVP.getBestPlan();
-      LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width << "\n");
-      assert(VF.Width == Width &&
-             "VPlan cost model and legacy cost model disagreed");
+      ElementCount Width = VF.Width;
+      std::optional<VPlan *> VPlanFromVPCost;
+      if (!UseLegacyCostModel) {
+        const auto &[VPWidth, Plan] = LVP.getBestPlan();
+        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
+                          << "\n");
+        Width = VPWidth;
+        assert(VF.Width == Width &&
+               "VPlan cost model and legacy cost model disagreed");
+      }
+      VPlan &BestPlan =
+          *VPlanFromVPCost.value_or(&LVP.getBestPlanFor(VF.Width));
 
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
@@ -10444,18 +10452,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!MainILV.areSafetyChecksAdded())
           DisableRuntimeUnroll = true;
       } else {
-        ElementCount Width = VF.Width;
-        VPlan &BestPlan =
-            UseLegacyCostModel ? LVP.getBestPlanFor(Width) : LVP.getBestPlan();
-        if (!UseLegacyCostModel) {
-          assert(size(BestPlan.vectorFactors()) == 1 &&
-                 "Plan should have a single VF");
-          Width = *BestPlan.vectorFactors().begin();
-          LLVM_DEBUG(dbgs()
-                     << "VF picked by VPlan cost model: " << Width << "\n");
-          assert(VF.Width == Width &&
-                 "VPlan cost model and legacy cost model disagreed");
-        }
         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 8e9713fecf29d..7fb580be9e22e 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -131,8 +131,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Not Interleaving.
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
@@ -287,8 +287,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Not Interleaving.
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
-; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  VF picked by VPlan cost model: vscale x 4
+; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue

>From 28f29301dd029662c55a0ca4b2ce2b996a84ddce Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 24 Jul 2024 18:08:32 +0100
Subject: [PATCH 3/4] !fixup introduce getBestVF.

---
 .../Vectorize/LoopVectorizationPlanner.h      |  5 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 55 +++++--------------
 2 files changed, 17 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 62b5d270e2f04..590f3153cd084 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -365,9 +365,8 @@ class LoopVectorizationPlanner {
   /// Return the best VPlan for \p VF.
   VPlan &getBestPlanFor(ElementCount VF) const;
 
-  /// Return the most profitable vectorization factor together with the most
-  /// profitable plan containing that vectorization factor.
-  std::pair<ElementCount, VPlan &> getBestPlan() const;
+  /// Return the most profitable vectorization factor.
+  ElementCount getBestVF() const;
 
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7a0e37b6e17b6..d389f87b17af8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7139,13 +7139,12 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
   return Cost;
 }
 
-std::pair<ElementCount, VPlan &> LoopVectorizationPlanner::getBestPlan() const {
+ElementCount LoopVectorizationPlanner::getBestVF() const {
   // If there is a single VPlan with a single VF, return it directly.
   VPlan &FirstPlan = *VPlans[0];
   if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
-    return {*FirstPlan.vectorFactors().begin(), FirstPlan};
+    return *FirstPlan.vectorFactors().begin();
 
-  VPlan *BestPlan = &FirstPlan;
   ElementCount ScalarVF = ElementCount::getFixed(1);
   assert(hasPlanWithVF(ScalarVF) &&
          "More than a single plan/VF w/o any plan having scalar VF");
@@ -7178,11 +7177,10 @@ std::pair<ElementCount, VPlan &> LoopVectorizationPlanner::getBestPlan() const {
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
       if (isMoreProfitable(CurrentFactor, BestFactor)) {
         BestFactor = CurrentFactor;
-        BestPlan = &*P;
       }
     }
   }
-  return {BestFactor.Width, *BestPlan};
+  return BestFactor.Width;
 }
 
 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -9970,13 +9968,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                  &CM, BFI, PSI, Checks);
 
-      VPlan &BestPlan = UseLegacyCostModel ? LVP.getBestPlanFor(VF.Width)
-                                           : LVP.getBestPlan().second;
-      assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
-=======
-      VPlan &BestPlan = LVP.getBestPlan();
-      assert(BestPlan.hasScalarVFOnly() &&
->>>>>>> origin/main
+      ElementCount BestVF = LVP.getBestVF();
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+      assert(BestVF.isScalar() &&
              "VPlan cost model and legacy cost model disagreed");
       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
 
@@ -9989,28 +9983,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
 
-      ElementCount Width = VF.Width;
-      std::optional<VPlan *> VPlanFromVPCost;
-      if (!UseLegacyCostModel) {
-        const auto &[VPWidth, Plan] = LVP.getBestPlan();
-        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
-                          << "\n");
-        Width = VPWidth;
-        assert(VF.Width == Width &&
-               "VPlan cost model and legacy cost model disagreed");
-      }
-      VPlan &BestPlan =
-          *VPlanFromVPCost.value_or(&LVP.getBestPlanFor(VF.Width));
-
+      ElementCount BestVF = LVP.getBestVF();
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+      LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
+      assert(VF.Width == BestVF &&
+             "VPlan cost model and legacy cost model disagreed");
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
+          LVP.selectEpilogueVectorizationFactor(BestVF, IC);
       if (EpilogueVF.Width.isVector()) {
 
         // The first pass vectorizes the main loop and creates a scalar epilogue
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
-        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+        EpilogueLoopVectorizationInfo EPI(BestVF, IC, EpilogueVF.Width, 1);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
@@ -10106,21 +10092,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         if (!MainILV.areSafetyChecksAdded())
           DisableRuntimeUnroll = true;
       } else {
-<<<<<<< HEAD
-=======
-        VPlan &BestPlan = LVP.getBestPlan();
-        assert(size(BestPlan.vectorFactors()) == 1 &&
-               "Plan should have a single VF");
-        ElementCount Width = *BestPlan.vectorFactors().begin();
-        LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
-                          << "\n");
-        assert(VF.Width == Width &&
-               "VPlan cost model and legacy cost model disagreed");
->>>>>>> origin/main
-        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
+        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, BestVF,
                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                                PSI, Checks);
-        LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
+        LVP.executePlan(BestVF, IC, BestPlan, LB, DT, false);
         ++LoopsVectorized;
 
         // Add metadata to disable runtime unrolling a scalar loop when there

>From 52f032c67102a7c02993c9563fa1c9b3c615e8d6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 26 Jul 2024 12:21:18 +0100
Subject: [PATCH 4/4] !fixup address comments, thanks!

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 752af51847873..9733ac073eedd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7198,9 +7198,8 @@ ElementCount LoopVectorizationPlanner::getBestVF() const {
 
       InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
-      if (isMoreProfitable(CurrentFactor, BestFactor)) {
+      if (isMoreProfitable(CurrentFactor, BestFactor))
         BestFactor = CurrentFactor;
-      }
     }
   }
   return BestFactor.Width;
@@ -9999,10 +9998,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                  &CM, BFI, PSI, Checks);
 
       ElementCount BestVF = LVP.getBestVF();
-      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
       assert(BestVF.isScalar() &&
              "VPlan cost model and legacy cost model disagreed");
-      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
+      LVP.executePlan(BestVF, IC, BestPlan, Unroller, DT, false);
 
       ORE->emit([&]() {
         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10014,10 +10013,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       // If we decided that it is *legal* to vectorize the loop, then do it.
 
       ElementCount BestVF = LVP.getBestVF();
-      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
       LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << BestVF << "\n");
       assert(VF.Width == BestVF &&
              "VPlan cost model and legacy cost model disagreed");
+      VPlan &BestPlan = LVP.getBestPlanFor(BestVF);
       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
           LVP.selectEpilogueVectorizationFactor(BestVF, IC);
@@ -10031,8 +10030,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
 
         assert(EPI.MainLoopVF == VF.Width && "VFs must match");
-        std::unique_ptr<VPlan> BestMainPlan(
-            LVP.getBestPlanFor(VF.Width).duplicate());
+        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
         const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
             EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan, MainILV, DT, true);
         ++LoopsVectorized;