[llvm] [SLP][REVEC] Add getScalarizationOverhead helper function to reduce error when REVEC is enabled. (PR #128530)

Han-Kuan Chen via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 25 00:00:13 PST 2025


https://github.com/HanKuanChen updated https://github.com/llvm/llvm-project/pull/128530

>From 9786add75946d9ac974b9365c7d93ade6efa7bcf Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 24 Feb 2025 00:44:05 -0800
Subject: [PATCH 1/5] [SLP][REVEC] Add getScalarizationOverhead helper function
 to reduce error when REVEC is enabled.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 3d660b63309d4..19823bd3cab44 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5014,6 +5014,39 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
 }
 
+/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
+/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
+/// instead of a scalar.
+static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
+                                                Type *ScalarTy, VectorType *Ty,
+                                                const APInt &DemandedElts,
+                                                bool Insert, bool Extract,
+                                                TTI::TargetCostKind CostKind,
+                                                ArrayRef<Value *> VL = {}) {
+  assert(!isa<ScalableVectorType>(Ty) &&
+         "ScalableVectorType is not supported.");
+  if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+    assert(SLPReVec && "Only supported by REVEC.");
+    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+    // of CreateInsertElement.
+    unsigned ScalarTyNumElements = VecTy->getNumElements();
+    InstructionCost Cost = 0;
+    for (unsigned I : seq(DemandedElts.getBitWidth())) {
+      if (!DemandedElts[I])
+        continue;
+      if (Insert)
+        Cost += getShuffleCost(TTI, TTI::SK_InsertSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+      if (Extract)
+        Cost += getShuffleCost(TTI, TTI::SK_ExtractSubvector, Ty, {}, CostKind,
+                               I * ScalarTyNumElements, VecTy);
+    }
+    return Cost;
+  }
+  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                      CostKind, VL);
+}
+
 /// Correctly creates insert_subvector, checking that the index is multiple of
 /// the subvectors length. Otherwise, generates shuffle using \p Generator or
 /// using default shuffle.

>From 26fe00a0fe24cd41ba261680fd9746aff23325f4 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 24 Feb 2025 00:55:05 -0800
Subject: [PATCH 2/5] [SLP][REVEC] Use getScalarizationOverhead.

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 112 ++++++------------
 1 file changed, 37 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 19823bd3cab44..60b4e0263f8bd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5273,8 +5273,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             /*VariableMask=*/false, CommonAlignment, CostKind) +
         (ProfitableGatherPointers ? 0 : VectorGEPCost);
     InstructionCost GatherCost =
-        TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                     /*Extract=*/false, CostKind) +
+        getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                 /*Insert=*/true,
+                                 /*Extract=*/false, CostKind) +
         ScalarLoadsCost;
     // The list of loads is small or perform partial check already - directly
     // compare masked gather cost and gather cost.
@@ -5327,10 +5328,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       // Can be vectorized later as a serie of loads/insertelements.
       InstructionCost VecLdCost = 0;
       if (!DemandedElts.isZero()) {
-        VecLdCost =
-            TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                         /*Extract=*/false, CostKind) +
-            ScalarGEPCost;
+        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                                             /*Insert=*/true,
+                                             /*Extract=*/false, CostKind) +
+                    ScalarGEPCost;
         for (unsigned Idx : seq<unsigned>(VL.size()))
           if (DemandedElts[Idx])
             VecLdCost +=
@@ -5356,13 +5357,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                 return getUnderlyingObject(V) !=
                        getUnderlyingObject(PointerOps.front());
               }))
-            VectorGEPCost += TTI.getScalarizationOverhead(
-                SubVecTy, APInt::getAllOnes(VF),
+            VectorGEPCost += getScalarizationOverhead(
+                TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF),
                 /*Insert=*/true, /*Extract=*/false, CostKind);
           else
             VectorGEPCost +=
-                TTI.getScalarizationOverhead(
-                    SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+                getScalarizationOverhead(
+                    TTI, ScalarTy, SubVecTy,
+                    APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
                     /*Insert=*/true, /*Extract=*/false, CostKind) +
                 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                                  CostKind);
@@ -9945,20 +9947,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
     Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                              Idx, getWidenedType(ScalarTy, Sz));
   }
-  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-    assert(SLPReVec && "Only supported by REVEC.");
-    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
-    // of CreateInsertElement.
-    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
-      if (DemandedElts[I])
-        Cost +=
-            TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
-                                CostKind, I * ScalarTyNumElements, FTy);
-  } else {
-    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                          /*Extract=*/false, CostKind);
-  }
+  Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                                   /*Insert=*/true,
+                                   /*Extract=*/false, CostKind);
   int Sz = TE.Scalars.size();
   SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                                TE.ReorderIndices.end());
@@ -9987,8 +9978,9 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
       ReorderMask[I] = I + Sz;
     }
   }
-  InstructionCost BVCost = TTI->getScalarizationOverhead(
-      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+  InstructionCost BVCost =
+      getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true, /*Extract=*/false, CostKind);
   if (!DemandedElts.isAllOnes())
     BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
   if (Cost >= BVCost) {
@@ -11636,9 +11628,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     assert(Offset < NumElts && "Failed to find vector index offset");
 
     InstructionCost Cost = 0;
-    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                          /*Insert*/ true, /*Extract*/ false,
-                                          CostKind);
+    Cost -=
+        getScalarizationOverhead(*TTI, ScalarTy, SrcVecTy, DemandedElts,
+                                 /*Insert*/ true, /*Extract*/ false, CostKind);
 
     // First cost - resize to actual vector size if not identity shuffle or
     // need to shift the vector.
@@ -13813,8 +13805,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       }
       if (!IsIdentity)
         FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
-      FirstShuffleCost += TTI->getScalarizationOverhead(
-          MaskVecTy, DemandedElts, /*Insert=*/true,
+      FirstShuffleCost += getScalarizationOverhead(
+          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
     InstructionCost SecondShuffleCost = 0;
@@ -13838,17 +13830,17 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       }
       if (!IsIdentity)
         SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
-      SecondShuffleCost += TTI->getScalarizationOverhead(
-          MaskVecTy, DemandedElts, /*Insert=*/true,
+      SecondShuffleCost += getScalarizationOverhead(
+          *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
     APInt DemandedElts = APInt::getAllOnes(SubMask.size());
     for (auto [I, Idx] : enumerate(SubMask))
       if (Idx == PoisonMaskElem)
         DemandedElts.clearBit(I);
-    InstructionCost BuildVectorCost =
-        TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
-                                      /*Extract=*/false, CostKind);
+    InstructionCost BuildVectorCost = getScalarizationOverhead(
+        *TTI, VL.front()->getType(), MaskVecTy, DemandedElts, /*Insert=*/true,
+        /*Extract=*/false, CostKind);
     const TreeEntry *BestEntry = nullptr;
     if (FirstShuffleCost < ShuffleCost) {
       std::for_each(std::next(Mask.begin(), Part * VL.size()),
@@ -14001,45 +13993,15 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
     ShuffledElements.setBit(I);
     ShuffleMask[I] = Res.first->second;
   }
-  if (!DemandedElements.isZero()) {
-    if (isa<FixedVectorType>(ScalarTy)) {
-      assert(SLPReVec && "Only supported by REVEC.");
-      // We don't need to insert elements one by one. Instead, we can insert the
-      // entire vector into the destination.
-      Cost = 0;
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-      for (unsigned I : seq<unsigned>(VL.size()))
-        if (DemandedElements[I])
-          Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
-                                   CostKind, I * ScalarTyNumElements,
-                                   cast<FixedVectorType>(ScalarTy));
-    } else {
-      Cost += TTI->getScalarizationOverhead(VecTy, DemandedElements,
-                                            /*Insert=*/true,
-                                            /*Extract=*/false, CostKind, VL);
-    }
-  }
-  if (ForPoisonSrc) {
-    if (isa<FixedVectorType>(ScalarTy)) {
-      assert(SLPReVec && "Only supported by REVEC.");
-      // We don't need to insert elements one by one. Instead, we can insert the
-      // entire vector into the destination.
-      assert(DemandedElements.isZero() &&
-             "Need to consider the cost from DemandedElements.");
-      Cost = 0;
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-      for (unsigned I : seq<unsigned>(VL.size()))
-        if (!ShuffledElements[I])
-          Cost += TTI->getShuffleCost(
-              TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
-              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
-    } else {
-      Cost = TTI->getScalarizationOverhead(VecTy,
-                                           /*DemandedElts*/ ~ShuffledElements,
-                                           /*Insert*/ true,
-                                           /*Extract*/ false, CostKind, VL);
-    }
-  }
+  if (!DemandedElements.isZero())
+    Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
+                                     /*Insert=*/true,
+                                     /*Extract=*/false, CostKind, VL);
+  if (ForPoisonSrc)
+    Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
+                                    /*DemandedElts*/ ~ShuffledElements,
+                                    /*Insert*/ true,
+                                    /*Extract*/ false, CostKind, VL);
   if (DuplicateNonConst)
     Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                              VecTy, ShuffleMask);

>From cb8acd6831bc22dc91e41ea0d772ad45d8f4384c Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 24 Feb 2025 08:25:17 -0800
Subject: [PATCH 3/5] add assert to prevent incorrect usage

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 60b4e0263f8bd..b7e69a9aea237 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5025,6 +5025,9 @@ static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
                                                 ArrayRef<Value *> VL = {}) {
   assert(!isa<ScalableVectorType>(Ty) &&
          "ScalableVectorType is not supported.");
+  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
+             getNumElements(Ty) &&
+         "Incorrect usage.");
   if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
     assert(SLPReVec && "Only supported by REVEC.");
     // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead

>From 91c156bfccb2eeb66b8a7bc50091959845ad2089 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 24 Feb 2025 08:53:39 -0800
Subject: [PATCH 4/5] use VF to instead of VecTy->getNumElements()

When ScalarTy is FixedVectorType, VecTy->getNumElements() is different
from ScalarTy is NOT FixedVectorType. However, VF is same.
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b7e69a9aea237..623208d7de80c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5243,10 +5243,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                     Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
     // Estimate the cost of masked gather GEP. If not a splat, roughly
     // estimate as a buildvector, otherwise estimate as splat.
-    APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+    APInt DemandedElts = APInt::getAllOnes(Sz);
     VectorType *PtrVecTy =
-        getWidenedType(PointerOps.front()->getType()->getScalarType(),
-                       VecTy->getNumElements());
+        getWidenedType(PointerOps.front()->getType()->getScalarType(), Sz);
     if (static_cast<unsigned>(count_if(
             PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
         any_of(PointerOps, [&](Value *V) {
@@ -5257,9 +5256,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
           PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
     else
       VectorGEPCost +=
-          TTI.getScalarizationOverhead(
-              PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
-              /*Insert=*/true, /*Extract=*/false, CostKind) +
+          TTI.getScalarizationOverhead(PtrVecTy, APInt::getOneBitSet(Sz, 0),
+                                       /*Insert=*/true, /*Extract=*/false,
+                                       CostKind) +
           ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
     // The cost of scalar loads.
     InstructionCost ScalarLoadsCost =
@@ -5340,7 +5339,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
             VecLdCost +=
                 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
       }
-      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
       auto *SubVecTy = getWidenedType(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
@@ -5366,8 +5364,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
           else
             VectorGEPCost +=
                 getScalarizationOverhead(
-                    TTI, ScalarTy, SubVecTy,
-                    APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
+                    TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0),
                     /*Insert=*/true, /*Extract=*/false, CostKind) +
                 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
                                  CostKind);
@@ -9969,7 +9966,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
                                ? TTI::SK_PermuteTwoSrc
                                : TTI::SK_PermuteSingleSrc,
                            VecTy, ReorderMask);
-  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+  DemandedElts = APInt::getAllOnes(TE.Scalars.size());
   ReorderMask.assign(Sz, PoisonMaskElem);
   for (unsigned I : seq<unsigned>(Sz)) {
     Value *V = TE.getOrdered(I);

>From ba93ce0c6e68f37f58b6828bb9fb269f28838811 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen at sifive.com>
Date: Mon, 24 Feb 2025 23:55:51 -0800
Subject: [PATCH 5/5] use general getScalarizationOverhead

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 623208d7de80c..02fea2eaf9d41 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5244,21 +5244,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     // Estimate the cost of masked gather GEP. If not a splat, roughly
     // estimate as a buildvector, otherwise estimate as splat.
     APInt DemandedElts = APInt::getAllOnes(Sz);
-    VectorType *PtrVecTy =
-        getWidenedType(PointerOps.front()->getType()->getScalarType(), Sz);
+    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
+    VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
     if (static_cast<unsigned>(count_if(
             PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
         any_of(PointerOps, [&](Value *V) {
           return getUnderlyingObject(V) !=
                  getUnderlyingObject(PointerOps.front());
         }))
-      VectorGEPCost += TTI.getScalarizationOverhead(
-          PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
+      VectorGEPCost += getScalarizationOverhead(TTI, PtrScalarTy, PtrVecTy,
+                                                DemandedElts, /*Insert=*/true,
+                                                /*Extract=*/false, CostKind);
     else
       VectorGEPCost +=
-          TTI.getScalarizationOverhead(PtrVecTy, APInt::getOneBitSet(Sz, 0),
-                                       /*Insert=*/true, /*Extract=*/false,
-                                       CostKind) +
+          getScalarizationOverhead(
+              TTI, PtrScalarTy, PtrVecTy, APInt::getOneBitSet(Sz, 0),
+              /*Insert=*/true, /*Extract=*/false, CostKind) +
           ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
     // The cost of scalar loads.
     InstructionCost ScalarLoadsCost =



More information about the llvm-commits mailing list