[llvm] 2271f0b - [SLP]Check for perfect/shuffled match for the split node

Tue Apr 15 13:18:32 PDT 2025

Author: Alexey Bataev
Date: 2025-04-15T13:17:46-07:00
New Revision: 2271f0bebd48c9ed8b16b500886a819c4f269a6a

URL: https://github.com/llvm/llvm-project/commit/2271f0bebd48c9ed8b16b500886a819c4f269a6a
DIFF: https://github.com/llvm/llvm-project/commit/2271f0bebd48c9ed8b16b500886a819c4f269a6a.diff

LOG: [SLP]Check for perfect/shuffled match for the split node

If the potential split node is a perfect/shuffled match of another split
node, need to skip creation of the another split node with the same
scalars, it should be a buildvector.

Fixes #135800

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 253933a2438cd..234cd340ebc13 100644

--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9575,6 +9575,24 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
       !SplitAlternateInstructions)
     return false;
 
+  // Check if this is a duplicate of another split entry.
+  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
+                    << ".\n");
+  for (TreeEntry *E : getSplitTreeEntries(LocalState.getMainOp())) {
+    if (E->isSame(VL)) {
+      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
+                        << *LocalState.getMainOp() << ".\n");
+      return false;
+    }
+    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
+    if (all_of(VL, [&](Value *V) {
+          return isa<PoisonValue>(V) || Values.contains(V);
+        })) {
+      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
+      return false;
+    }
+  }
+
   ReorderIndices.assign(VL.size(), VL.size());
   SmallBitVector Op1Indices(VL.size());
   for (auto [Idx, V] : enumerate(VL)) {

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll
new file mode 100644
index 0000000000000..10e73b042f19b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-full-match.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(double %0) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: double [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[_THREAD:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call double null(ptr null, ptr null, ptr null)
+; CHECK-NEXT:    [[TMP2:%.*]] = call double null(ptr null, ptr null, ptr null)
+; CHECK-NEXT:    br i1 false, label %[[BB3:.*]], label %[[BB7:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = call double null(ptr null, ptr null, ptr null)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double poison, double 0.000000e+00>, double [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP4]], i32 0
+; CHECK-NEXT:    br label %[[BB7]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <4 x double> [ [[TMP6]], %[[BB3]] ], [ zeroinitializer, [[DOTTHREAD:%.*]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x double> [[TMP8]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP12:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP11]], <2 x double> [[TMP10]], i64 4)
+; CHECK-NEXT:    br i1 false, label %[[DOTLR_PH272_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK:       [[_LR_PH272_PREHEADER:.*:]]
+; CHECK-NEXT:    br i1 false, [[DOT_CRIT_EDGE]], label %[[BB13:.*]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <6 x double> [[TMP12]], <6 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <6 x double> @llvm.vector.insert.v6f64.v2f64(<6 x double> [[TMP15]], <2 x double> splat (double 0x7FF8000000000000), i64 4)
+; CHECK-NEXT:    br i1 false, label %[[BB17:.*]], [[DOT_CRIT_EDGE]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <6 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00, double 0.000000e+00>, double [[TMP0]], i32 3
+; CHECK-NEXT:    br [[DOT_CRIT_EDGE]]
+; CHECK:       [[__CRIT_EDGE:.*:]]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <6 x double> [ [[TMP12]], %[[BB7]] ], [ [[TMP18]], %[[BB17]] ], [ [[TMP16]], %[[BB13]] ], [ [[TMP12]], %[[DOTLR_PH272_PREHEADER]] ]
+; CHECK-NEXT:    ret void
+;
+.thread:
+  %1 = call double null(ptr null, ptr null, ptr null)
+  %2 = call double null(ptr null, ptr null, ptr null)
+  br i1 false, label %3, label %5
+
+3:
+  %4 = call double null(ptr null, ptr null, ptr null)
+  br label %5
+
+5:
+  %.1226 = phi double [ %4, %3 ], [ 0.000000e+00, %.thread ]
+  %.1222 = phi double [ 0.000000e+00, %3 ], [ 0.000000e+00, %.thread ]
+  %.1218 = phi double [ %0, %3 ], [ 0.000000e+00, %.thread ]
+  %.1216 = phi double [ 0.000000e+00, %3 ], [ 0.000000e+00, %.thread ]
+  br i1 false, label %.lr.ph272.preheader, label %._crit_edge
+
+.lr.ph272.preheader:
+  br i1 false, label %._crit_edge, label %6
+
+6:
+  %7 = fdiv double 0.000000e+00, 0.000000e+00
+  %8 = fsub double 0.000000e+00, %7
+  %9 = fdiv double 0.000000e+00, 0.000000e+00
+  %10 = fsub double 0.000000e+00, %9
+  br i1 false, label %11, label %._crit_edge
+
+11:
+  br label %._crit_edge
+
+._crit_edge:
+  %.2227.lcssa = phi double [ %.1226, %5 ], [ 0.000000e+00, %11 ], [ %.1226, %6 ], [ %.1226, %.lr.ph272.preheader ]
+  %.2223.lcssa = phi double [ %.1222, %5 ], [ 0.000000e+00, %11 ], [ %.1222, %6 ], [ %.1222, %.lr.ph272.preheader ]
+  %.2219.lcssa = phi double [ %.1218, %5 ], [ 0.000000e+00, %11 ], [ %.1218, %6 ], [ %.1218, %.lr.ph272.preheader ]
+  %.2.lcssa = phi double [ %.1216, %5 ], [ %0, %11 ], [ %.1216, %6 ], [ %.1216, %.lr.ph272.preheader ]
+  %.0213.lcssa = phi double [ %2, %5 ], [ 0.000000e+00, %11 ], [ %10, %6 ], [ %2, %.lr.ph272.preheader ]
+  %.0211.lcssa = phi double [ %1, %5 ], [ 0.000000e+00, %11 ], [ %8, %6 ], [ %1, %.lr.ph272.preheader ]
+  ret void
+}

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
index b7b6c10137b64..9abb994db1e73 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
@@ -15,8 +15,9 @@ define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4)
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison>
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP12]], <16 x i32> <i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 14, i32 14, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP9]], i32 15
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul <16 x float> [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP20]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = call float @foo(float [[TMP21]])