[llvm] 6f7160e - [SLP] Attempt to vectorize long stores if short ones failed.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 17 10:27:59 PDT 2024


Author: Alexey Bataev
Date: 2024-04-17T10:24:35-07:00
New Revision: 6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc

URL: https://github.com/llvm/llvm-project/commit/6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc
DIFF: https://github.com/llvm/llvm-project/commit/6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc.diff

LOG: [SLP] Attempt to vectorize long stores if short ones failed.

If vectorizing short store sequences fails because it is not profitable, we
can retry with longer sequences. This should not increase compile time
significantly (the stores are already sorted, so the complexity stays at
n log n), and it lets extra code get vectorized.
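
For illustration, here is a minimal standalone sketch of the retry strategy in
plain C++ (C++20 for std::bit_floor), not the actual LLVM code: run the usual
candidate VFs first and, if part of the store sequence stays unvectorized for
cost reasons, make one more attempt with larger candidate VFs, capped at
min(max(16, 4 * MaxVF), the largest power of two covering the unvectorized
span). The names vectorizeStoreSeq and TryVectorize, and the use of
std::vector<bool> instead of SmallBitVector, are simplifications for this
sketch, not the real SLPVectorizer API.

// Sketch only; requires C++20 (std::bit_floor). Illustrates the retry loop,
// not the actual implementation in SLPVectorizer.cpp.
#include <algorithm>
#include <bit>
#include <cstdio>
#include <functional>
#include <iterator>
#include <vector>

using TryVectorizeFn = std::function<bool(unsigned Begin, unsigned Size)>;

// Returns a per-store flag telling which stores were "vectorized".
static std::vector<bool> vectorizeStoreSeq(unsigned NumStores, unsigned MinVF,
                                           unsigned MaxVF,
                                           const TryVectorizeFn &TryVectorize) {
  std::vector<bool> Vectorized(NumStores, false);
  // First attempt: the usual candidate VFs, from MaxVF down to MinVF.
  std::vector<unsigned> CandidateVFs;
  for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2)
    CandidateVFs.push_back(VF);

  constexpr unsigned MaxAttempts = 2;
  for (unsigned Attempt = 1; Attempt <= MaxAttempts; ++Attempt) {
    for (unsigned VF : CandidateVFs) {
      for (unsigned Cnt = 0; Cnt + VF <= NumStores;) {
        // Skip slices that overlap stores which are already vectorized (the
        // patch tracks this with a SmallBitVector of vectorized ranges).
        bool Overlaps =
            std::any_of(Vectorized.begin() + Cnt, Vectorized.begin() + Cnt + VF,
                        [](bool B) { return B; });
        if (!Overlaps && TryVectorize(Cnt, VF)) {
          std::fill(Vectorized.begin() + Cnt, Vectorized.begin() + Cnt + VF,
                    true);
          Cnt += VF;
          continue;
        }
        ++Cnt;
      }
    }
    // Everything vectorized, or all attempts used up: stop.
    if (std::all_of(Vectorized.begin(), Vectorized.end(),
                    [](bool B) { return B; }))
      break;
    if (Attempt == MaxAttempts)
      break;
    // Second attempt: allow chains wider than MaxVF, capped as in the patch:
    // min(max(16, 4 * MaxVF), largest power of 2 covering the unvectorized
    // span).
    auto FirstUnset = std::find(Vectorized.begin(), Vectorized.end(), false);
    auto LastUnset = std::find(Vectorized.rbegin(), Vectorized.rend(), false);
    unsigned SpanLen =
        static_cast<unsigned>(std::distance(FirstUnset, LastUnset.base()));
    unsigned MaxTotalNum =
        std::min(std::max(16u, 4 * MaxVF), std::bit_floor(SpanLen));
    if (MaxVF >= MaxTotalNum)
      break;
    CandidateVFs.clear();
    for (unsigned VF = MaxTotalNum; VF > MaxVF; VF /= 2)
      CandidateVFs.push_back(VF);
  }
  return Vectorized;
}

int main() {
  // Toy cost model: only chains of at least 8 stores are "profitable", so the
  // first attempt (VFs 4 and 2) fails and the retry with VF = 16/8 succeeds.
  auto TryVectorize = [](unsigned /*Begin*/, unsigned Size) {
    return Size >= 8;
  };
  std::vector<bool> V = vectorizeStoreSeq(/*NumStores=*/16, /*MinVF=*/2,
                                          /*MaxVF=*/4, TryVectorize);
  std::printf("vectorized %zu of %zu stores\n",
              static_cast<size_t>(std::count(V.begin(), V.end(), true)),
              V.size());
  return 0;
}

With the toy cost model above, the first attempt (VF 4 and 2) finds nothing
profitable and the second attempt vectorizes all 16 stores at VF 16, which is
the "long stores" retry this patch adds.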

Metric: size..text

Program                                                                         size..text
                                                                                results     results0    diff
         test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  1088012.00  1088236.00  0.0%
                  test-suite :: SingleSource/UnitTests/matrix-types-spec.test   480396.00   480476.00  0.0%
          test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   664613.00   664661.00  0.0%
         test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   664613.00   664661.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2041105.00  2040961.00 -0.0%
                 test-suite :: MultiSource/Applications/JM/lencod/lencod.test   836563.00   836387.00 -0.0%
                 test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1035100.00  1032140.00 -0.3%

In all benchmarks, extra code gets vectorized.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/88563

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7694627c3b0430..806e8085038b35 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15164,10 +15164,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  // Stores the pair of stores (first_store, last_store) in a range, that were
-  // already tried to be vectorized. Allows to skip the store ranges that were
-  // already tried to be vectorized but the attempts were unsuccessful.
-  DenseSet<std::pair<Value *, Value *>> TriedSequences;
   struct StoreDistCompare {
     bool operator()(const std::pair<unsigned, int> &Op1,
                     const std::pair<unsigned, int> &Op2) const {
@@ -15209,8 +15205,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
       Type *ValueTy = StoreTy;
       if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
         ValueTy = Trunc->getSrcTy();
-      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
-          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+      unsigned MinVF = std::max<unsigned>(
+          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+                 ValueTy)));
 
       if (MaxVF < MinVF) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15236,40 +15234,74 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         VF = Size > MaxVF ? NonPowerOf2VF : Size;
         Size *= 2;
       });
-      unsigned StartIdx = 0;
-      for (unsigned Size : CandidateVFs) {
-        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-          assert(
-              all_of(
-                  Slice,
-                  [&](Value *V) {
-                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                           cast<StoreInst>(Slice.front())
-                               ->getValueOperand()
-                               ->getType();
-                  }) &&
-              "Expected all operands of same type.");
-          if (!VectorizedStores.count(Slice.front()) &&
-              !VectorizedStores.count(Slice.back()) &&
-              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
-                  .second &&
-              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-            // Mark the vectorized stores so that we don't vectorize them again.
-            VectorizedStores.insert(Slice.begin(), Slice.end());
-            Changed = true;
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += Size;
-            Cnt += Size;
-            continue;
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 2;
+      SmallBitVector Range(Operands.size());
+      while (true) {
+        ++Repeat;
+        for (unsigned Size : CandidateVFs) {
+          int StartIdx = Range.find_first_unset();
+          while (StartIdx != -1) {
+            int EndIdx = Range.find_next(StartIdx);
+            unsigned Sz = EndIdx == -1 ? End : EndIdx;
+            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert(Slice.begin(), Slice.end());
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                Changed = true;
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                Range.set(Cnt, Cnt + Size);
+                if (Cnt < StartIdx + MinVF)
+                  Range.set(StartIdx, Cnt);
+                if (Cnt > EndIdx - Size - MinVF) {
+                  Range.set(Cnt + Size, EndIdx);
+                  End = Cnt;
+                }
+                Cnt += Size;
+                continue;
+              }
+              ++Cnt;
+            }
+            if (Sz >= End)
+              break;
+            StartIdx = Range.find_next_unset(EndIdx);
           }
-          ++Cnt;
         }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= Operands.size())
+        // All values vectorize - exit.
+        if (Range.all())
+          break;
+        // Check if tried all attempts or no need for the last attempts at all.
+        if (Repeat >= MaxAttempts)
+          break;
+        constexpr unsigned MaxVFScale = 4;
+        constexpr unsigned StoresLimit = 16;
+        const unsigned MaxTotalNum = std::min(
+            std::max<unsigned>(StoresLimit, MaxVFScale * MaxVF),
+            bit_floor(static_cast<unsigned>(Range.find_last_unset() -
+                                            Range.find_first_unset() + 1)));
+        if (MaxVF >= MaxTotalNum)
           break;
+        // Last attempt to vectorize max number of elements, if all previous
+        // attempts were unsuccessful because of the cost issues.
+        CandidateVFs.clear();
+        for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
+          CandidateVFs.push_back(Size);
       }
     }
   };

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43f3..3deab0975ce764 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-LABEL: @store_i64(
 ; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; SSE-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
+; SSE-NEXT:    store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @store_i64(

More information about the llvm-commits mailing list