[llvm] d7dd31e - [SLP]Better analysis of the repeated instructions during operands reordering

Tue Sep 24 14:04:48 PDT 2024

Author: Alexey Bataev
Date: 2024-09-24T14:03:10-07:00
New Revision: d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e

URL: https://github.com/llvm/llvm-project/commit/d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e
DIFF: https://github.com/llvm/llvm-project/commit/d7dd31e41791d71ad81af9cc4e7a26b26d4cb27e.diff

LOG: [SLP]Better analysis of the repeated instructions during operands reordering

When analyzing repeated instructions, it is better to treat the
reordering as non-profitable if the number of unique instructions is not
a power of 2. In this case it is better to keep a power-of-2 number of
elements, as this allows better vectorization.

Fixes https://github.com/llvm/llvm-project/issues/109725

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b79e964cdb1b6b..414c6388c777b3 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1930,30 +1930,38 @@ class BoUpSLP {
     /// elements in the lane, it will be vectorized with higher probability
     /// after removing duplicates. Currently the SLP vectorizer supports only
     /// vectorization of the power-of-2 number of unique scalars.
-    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
+    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
+                      const SmallBitVector &UsedLanes) const {
       Value *IdxLaneV = getData(Idx, Lane).V;
-      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
+      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
+          isa<ExtractElementInst>(IdxLaneV))
         return 0;
-      SmallPtrSet<Value *, 4> Uniques;
-      for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
+      SmallDenseMap<Value *, unsigned, 4> Uniques;
+      for (unsigned Ln : seq<unsigned>(getNumLanes())) {
         if (Ln == Lane)
           continue;
         Value *OpIdxLnV = getData(OpIdx, Ln).V;
         if (!isa<Instruction>(OpIdxLnV))
           return 0;
-        Uniques.insert(OpIdxLnV);
+        Uniques.try_emplace(OpIdxLnV, Ln);
       }
-      int UniquesCount = Uniques.size();
-      int UniquesCntWithIdxLaneV =
-          Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
+      unsigned UniquesCount = Uniques.size();
+      auto IdxIt = Uniques.find(IdxLaneV);
+      unsigned UniquesCntWithIdxLaneV =
+          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
       Value *OpIdxLaneV = getData(OpIdx, Lane).V;
-      int UniquesCntWithOpIdxLaneV =
-          Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
+      auto OpIdxIt = Uniques.find(OpIdxLaneV);
+      unsigned UniquesCntWithOpIdxLaneV =
+          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
       if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
         return 0;
-      return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
-              UniquesCntWithOpIdxLaneV) -
-             (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
+      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
+                          UniquesCntWithOpIdxLaneV,
+                      UniquesCntWithOpIdxLaneV -
+                          bit_floor(UniquesCntWithOpIdxLaneV)) -
+             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
+                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
+                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
     }
 
     /// \param Lane lane of the operands under analysis.
@@ -1993,7 +2001,7 @@ class BoUpSLP {
     /// predecessors.
     int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                           int Lane, unsigned OpIdx, unsigned Idx,
-                          bool &IsUsed) {
+                          bool &IsUsed, const SmallBitVector &UsedLanes) {
       LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                     LookAheadMaxDepth);
       // Keep track of the instruction stack as we recurse into the operands
@@ -2002,11 +2010,10 @@ class BoUpSLP {
           LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                        /*CurrLevel=*/1, MainAltOps);
       if (Score) {
-        int SplatScore = getSplatScore(Lane, OpIdx, Idx);
+        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
         if (Score <= -SplatScore) {
-          // Set the minimum score for splat-like sequence to avoid setting
-          // failed state.
-          Score = 1;
+          // Failed score.
+          Score = 0;
         } else {
           Score += SplatScore;
           // Scale score to see the difference between different operands
@@ -2036,7 +2043,8 @@ class BoUpSLP {
     std::optional<unsigned>
     getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                    ArrayRef<ReorderingMode> ReorderingModes,
-                   ArrayRef<Value *> MainAltOps) {
+                   ArrayRef<Value *> MainAltOps,
+                   const SmallBitVector &UsedLanes) {
       unsigned NumOperands = getNumOperands();
 
       // The operand of the previous lane at OpIdx.
@@ -2092,7 +2100,7 @@ class BoUpSLP {
           Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
           Value *OpRight = (LeftToRight) ? Op : OpLastLane;
           int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
-                                        OpIdx, Idx, IsUsed);
+                                        OpIdx, Idx, IsUsed, UsedLanes);
           if (Score > static_cast<int>(BestOp.Score) ||
               (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
                Idx == OpIdx)) {
@@ -2507,20 +2515,24 @@ class BoUpSLP {
         for (unsigned I = 0; I < NumOperands; ++I)
           MainAltOps[I].push_back(getData(I, FirstLane).V);
 
+        SmallBitVector UsedLanes(NumLanes);
+        UsedLanes.set(FirstLane);
         for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
           // Visit the lane on the right and then the lane on the left.
           for (int Direction : {+1, -1}) {
             int Lane = FirstLane + Direction * Distance;
             if (Lane < 0 || Lane >= (int)NumLanes)
               continue;
+            UsedLanes.set(Lane);
             int LastLane = Lane - Direction;
             assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                    "Out of bounds");
             // Look for a good match for each operand.
             for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
               // Search for the operand that matches SortedOps[OpIdx][Lane-1].
-              std::optional<unsigned> BestIdx = getBestOperand(
-                  OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
+              std::optional<unsigned> BestIdx =
+                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
+                                 MainAltOps[OpIdx], UsedLanes);
               // By not selecting a value, we allow the operands that follow to
               // select a better matching value. We will get a non-null value in
               // the next run of getBestOperand().

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 9691cb7537a702..33fa00c1881da3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -6,29 +6,23 @@ define i32 @a() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i8 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi i8 [ 0, [[TMP0]] ], [ [[TMP8:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT:    [[TMP7]] = extractelement <4 x i8> [[TMP6]], i32 3
-; CHECK-NEXT:    [[TMP8]] = extractelement <4 x i8> [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i8> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = xor i8 [[TMP9]], [[TMP3]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i8> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = xor i8 [[TMP11]], [[TMP2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = xor i8 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor i8 [[TMP7]], [[TMP11]]
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 0, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 2
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP13]], i32 4
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 poison, i32 13>
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP14]], i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
 ; CHECK-NEXT:    store <8 x i8> [[TMP23]], ptr null, align 4
+; CHECK-NEXT:    [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1]]
 ;
   br label %1