[llvm] 7d01a8f - [SLP]Fix vector factor for repeated node for bv

Mon Jan 20 14:25:56 PST 2025

Author: Alexey Bataev
Date: 2025-01-20T14:22:20-08:00
New Revision: 7d01a8f2b9ac28ffe73bef4b513d383d3edf34b5

URL: https://github.com/llvm/llvm-project/commit/7d01a8f2b9ac28ffe73bef4b513d383d3edf34b5
DIFF: https://github.com/llvm/llvm-project/commit/7d01a8f2b9ac28ffe73bef4b513d383d3edf34b5.diff

LOG: [SLP]Fix vector factor for repeated node for bv

When adding a node vector, when it is used already in the shuffle for
buildvector, need to calculate vector factor from all vector, not only
this single vector, to avoid incorrect result. Also, need to increase
stability of the reused entries detection to avoid mismatch in cost
estimation/codegen.

Fixes #123639

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fc3afed391a06d..ad4855d908747e 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13205,9 +13205,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
           VTE = *MIt;
         }
       }
-      Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
-      if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
-        continue;
+      if (none_of(TE->CombinedEntriesWithIndices,
+                  [&](const auto &P) { return P.first == VTE->Idx; })) {
+        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+          continue;
+      }
       VToTEs.insert(VTE);
     }
     if (VToTEs.empty())
@@ -14497,7 +14500,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
           break;
         }
     }
-    int VF = getVF(V1);
+    unsigned VF = 0;
+    for (Value *V : InVectors)
+      VF = std::max(VF, getVF(V));
     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
       if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
         CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll
index 34606feade5d35..f7d78be4f13cad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-node-reuse-in-bv.ll
@@ -8,7 +8,7 @@
 ; YAML-NEXT: Function:        test
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-38'
+; YAML-NEXT:   - Cost:            '-41'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '7'
 ; YAML-NEXT: ...
@@ -17,7 +17,7 @@ define i64 @test() {
 ; CHECK-LABEL: define i64 @test(
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 6
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 1, i32 0>, i32 0, i32 6
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP1]], <4 x i32> <i32 0, i32 0, i32 0, i32 1>, i64 24)
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <32 x i32> zeroinitializer, [[TMP2]]