[llvm] efd6055 - Revert "[SLP]Attempt to vectorize long stores, if short one failed."

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 16 17:25:48 PDT 2024


Author: Nikita Popov
Date: 2024-04-17T09:25:05+09:00
New Revision: efd60556f759fbfa0fc0a5984463daeaef20799c

URL: https://github.com/llvm/llvm-project/commit/efd60556f759fbfa0fc0a5984463daeaef20799c
DIFF: https://github.com/llvm/llvm-project/commit/efd60556f759fbfa0fc0a5984463daeaef20799c.diff

LOG: Revert "[SLP]Attempt to vectorize long stores, if short one failed."

This reverts commit 7d4e8c1f3bbfe976f4871c9cf953f76d771b0eda.

Contrary to the commit description, this does cause large
compile-time regressions (up to 10% on individual files).
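
For reference, here is a minimal, self-contained C++ sketch of the retry loop
this revert removes, reconstructed from the diff below. tryVectorizeChain is a
hypothetical stub standing in for vectorizeStoreChain and its cost model; the
real code also checks operand types and skips already-vectorized or
already-tried slices. The second trip through the while loop is where the
extra compile time goes: after an unsuccessful first pass it rebuilds
CandidateVFs with widths above MaxVF and rescans the remaining stores.

#include <bit>
#include <vector>

// Hypothetical stub for SLPVectorizer's vectorizeStoreChain + cost checks.
bool tryVectorizeChain(unsigned Begin, unsigned Size) { return false; }

void vectorizeStoresSketch(unsigned NumStores, unsigned MaxVF,
                           std::vector<unsigned> CandidateVFs) {
  unsigned StartIdx = 0;
  unsigned Repeat = 0;
  constexpr unsigned MaxAttempts = 2;
  while (true) {
    ++Repeat;
    for (unsigned Size : CandidateVFs) {
      // Slide a window of Size consecutive stores over the unvectorized tail.
      for (unsigned Cnt = StartIdx; Cnt + Size <= NumStores;) {
        if (tryVectorizeChain(Cnt, Size)) {
          if (Cnt == StartIdx)
            StartIdx += Size; // Leading block vectorized; skip past it.
          Cnt += Size;
          continue;
        }
        ++Cnt;
      }
      if (StartIdx >= NumStores) { // Whole array vectorized - stop retrying.
        Repeat = MaxAttempts;
        break;
      }
    }
    if (Repeat >= MaxAttempts)
      break;
    // Last attempt: retry with VFs larger than MaxVF, up to the largest
    // power of two that fits the remaining stores. This extra rescan is
    // the likely source of the reported compile-time regressions.
    const unsigned MaxTotalNum = std::bit_floor(NumStores - StartIdx);
    if (MaxVF >= MaxTotalNum)
      break;
    CandidateVFs.clear();
    for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
      CandidateVFs.push_back(Size);
  }
}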

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8ae38550d3095d..7694627c3b0430 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15237,60 +15237,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
         Size *= 2;
       });
       unsigned StartIdx = 0;
-      unsigned Repeat = 0;
-      constexpr unsigned MaxAttempts = 2;
-      while (true) {
-        ++Repeat;
-        for (unsigned Size : CandidateVFs) {
-          for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-            ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-            assert(
-                all_of(
-                    Slice,
-                    [&](Value *V) {
-                      return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                             cast<StoreInst>(Slice.front())
-                                 ->getValueOperand()
-                                 ->getType();
-                    }) &&
-                "Expected all operands of same type.");
-            if (!VectorizedStores.count(Slice.front()) &&
-                !VectorizedStores.count(Slice.back()) &&
-                TriedSequences
-                    .insert(std::make_pair(Slice.front(), Slice.back()))
-                    .second &&
-                vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-              // Mark the vectorized stores so that we don't vectorize them
-              // again.
-              VectorizedStores.insert(Slice.begin(), Slice.end());
-              Changed = true;
-              // If we vectorized initial block, no need to try to vectorize
-              // it again.
-              if (Cnt == StartIdx)
-                StartIdx += Size;
-              Cnt += Size;
-              continue;
-            }
-            ++Cnt;
-          }
-          // Check if the whole array was vectorized already - exit.
-          if (StartIdx >= Operands.size()) {
-            Repeat = MaxAttempts;
-            break;
+      for (unsigned Size : CandidateVFs) {
+        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+          assert(
+              all_of(
+                  Slice,
+                  [&](Value *V) {
+                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
+                           cast<StoreInst>(Slice.front())
+                               ->getValueOperand()
+                               ->getType();
+                  }) &&
+              "Expected all operands of same type.");
+          if (!VectorizedStores.count(Slice.front()) &&
+              !VectorizedStores.count(Slice.back()) &&
+              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
+                  .second &&
+              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+            // Mark the vectorized stores so that we don't vectorize them again.
+            VectorizedStores.insert(Slice.begin(), Slice.end());
+            Changed = true;
+            // If we vectorized initial block, no need to try to vectorize it
+            // again.
+            if (Cnt == StartIdx)
+              StartIdx += Size;
+            Cnt += Size;
+            continue;
           }
+          ++Cnt;
         }
-        // Check if tried all attempts or no need for the last attempts at all.
-        if (Repeat >= MaxAttempts)
-          break;
-        const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx);
-        if (MaxVF >= MaxTotalNum)
+        // Check if the whole array was vectorized already - exit.
+        if (StartIdx >= Operands.size())
           break;
-        // Last attempt to vectorize max number of elements, if all previous
-        // attempts were unsuccessful because of the cost issues.
-        CandidateVFs.clear();
-        for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) {
-          CandidateVFs.push_back(Size);
-        }
       }
     }
   };
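
The pr46983.ll change below shows the user-visible effect of the revert on the
SSE run line: @store_i64 drops the single <4 x i64> chain and goes back to four
scalar i64 stores. A plausible walk-through, assuming MaxVF = 2 for i64 stores
with 128-bit SSE registers (an assumption, not stated in this commit): the
first pass at VF = 2 fails the cost check, and only the now-removed second
attempt could propose the wider VF = 4.

#include <bit>
#include <cstdio>

int main() {
  // Assumed numbers for @store_i64 under SSE: four consecutive i64 stores,
  // MaxVF = 128 / 64 = 2 (hypothetical; the real value comes from TTI).
  unsigned NumStores = 4, StartIdx = 0, MaxVF = 2;
  // After the VF = 2 pass fails, the reverted second attempt computed:
  unsigned MaxTotalNum = std::bit_floor(NumStores - StartIdx); // = 4
  if (MaxVF < MaxTotalNum)
    std::printf("retry with VF = %u\n", MaxTotalNum); // VF = 4 -> <4 x i64>
  return 0;
}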

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 3deab0975ce764..75505f632a43f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,17 +100,41 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-LABEL: @store_i64(
 ; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; SSE-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
-; SSE-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; SSE-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
-; SSE-NEXT:    [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; SSE-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
-; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
-; SSE-NEXT:    store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
+; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
+; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
+; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
+; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
+; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
+; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
+; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
+; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
+; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
+; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
+; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
+; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
+; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
+; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
+; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
+; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
+; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
+; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
+; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
+; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
+; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
+; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @store_i64(


        

