[llvm] aaaa2a3 - [SLP]Support vectorization of previously vectorized scalars in split nodes

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 10 09:06:42 PDT 2025


Author: Alexey Bataev
Date: 2025-04-10T12:06:38-04:00
New Revision: aaaa2a325bd1abb8c87e0171384fd2c42da5e38a

URL: https://github.com/llvm/llvm-project/commit/aaaa2a325bd1abb8c87e0171384fd2c42da5e38a
DIFF: https://github.com/llvm/llvm-project/commit/aaaa2a325bd1abb8c87e0171384fd2c42da5e38a.diff

LOG: [SLP]Support vectorization of previously vectorized scalars in split nodes

Patch removes the restriction for the revectorization of the previously
vectorized scalars in split nodes, and moves the cost profitability
check to avoid regressions.

Reviewers: hiraditya, RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/134286

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
    llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
    llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 444bd08eac549..b563dc8e4f2a6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -895,6 +895,13 @@ class InstructionsState {
            is_contained(AddSub, getAltOpcode());
   }
 
+  /// Checks if main/alt instructions are cmp operations.
+  bool isCmpOp() const {
+    return (getOpcode() == Instruction::ICmp ||
+            getOpcode() == Instruction::FCmp) &&
+           getAltOpcode() == getOpcode();
+  }
+
   /// Checks if the current state is valid, i.e. has non-null MainOp
   bool valid() const { return MainOp && AltOp; }
 
@@ -9277,6 +9284,16 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
   // as alternate ops.
   if (NumParts >= VL.size())
     return false;
+  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
+  InstructionCost InsertCost = ::getShuffleCost(
+      *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
+  FixedVectorType *SubVecTy =
+      getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
+  InstructionCost NewShuffleCost =
+      ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
+  if (!LocalState.isCmpOp() && NumParts <= 1 &&
+      (Mask.empty() || InsertCost >= NewShuffleCost))
+    return false;
   if ((LocalState.getMainOp()->isBinaryOp() &&
        LocalState.getAltOp()->isBinaryOp() &&
        (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
@@ -9284,15 +9301,6 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
       (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
       (LocalState.getMainOp()->isUnaryOp() &&
        LocalState.getAltOp()->isUnaryOp())) {
-    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
-    InstructionCost InsertCost = ::getShuffleCost(
-        *TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
-    FixedVectorType *SubVecTy =
-        getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
-    InstructionCost NewShuffleCost =
-        ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
-    if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
-      return false;
     InstructionCost OriginalVecOpsCost =
         TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
         TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
@@ -9429,18 +9437,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
       return false;
 
-    // Any value is used in split node already - just gather.
-    if (any_of(VL, [&](Value *V) {
-          return ScalarsInSplitNodes.contains(V) || isVectorized(V);
-        })) {
-      if (TryToFindDuplicates(S)) {
-        auto Invalid = ScheduleBundle::invalid();
-        newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndices);
-      }
-      return true;
-    }
-
     SmallVector<Value *> NewVL(VL.size());
     copy(Op1, NewVL.begin());
     copy(Op2, std::next(NewVL.begin(), Op1.size()));
@@ -9616,9 +9612,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
           ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
                                      /*Insert=*/false, /*Extract=*/true, Kind);
-      InstructionCost ScalarizeCostEstimate =
-          ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Vectorized,
-                                     /*Insert=*/true, /*Extract=*/false, Kind);
+      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
+          *TTI, ScalarTy, VecTy, Vectorized,
+          /*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
       PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
     }
     if (PreferScalarize) {

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
index 38e9ba7ce7028..1c4f51700d083 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll
@@ -7,18 +7,22 @@ define i32 @a() {
 ; CHECK-NEXT:    br label %[[BB1:.*]]
 ; CHECK:       [[BB1]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP6]] = load <4 x i8>, ptr null, align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 2, i32 3, i32 12, i32 3, i32 12, i32 13, i32 14>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i8> [[TMP13]], ptr null, align 4
+; CHECK-NEXT:    store <8 x i8> [[TMP23]], ptr null, align 4
+; CHECK-NEXT:    [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[BB1]]
 ;
   br label %1

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
index c3da2aad4c869..47a25a089c69e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
@@ -6,20 +6,19 @@ define void @test(ptr %0, <8 x i8> %1) {
 ; CHECK-SAME: ptr [[TMP0:%.*]], <8 x i8> [[TMP1:%.*]]) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP0]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13436
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13444
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13544
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13536
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13437
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 0, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <8 x i8> poison, i8 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP11]], <8 x i8> [[TMP10]], i64 8)
-; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <8 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[TMP7]], i64 0)
-; CHECK-NEXT:    [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP15]], <8 x i8> [[TMP14]], i64 8)
-; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i8> [[TMP16]], [[TMP12]]
-; CHECK-NEXT:    store <16 x i8> [[TMP17]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i8> [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    store <8 x i8> [[TMP13]], ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 0, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i8> [[TMP7]], [[TMP14]]
+; CHECK-NEXT:    store <8 x i8> [[TMP15]], ptr [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %3 = load i8, ptr %0, align 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
index e9884b24e1078..b7b6c10137b64 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll
@@ -16,9 +16,8 @@ define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4)
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP12]], <16 x i32> <i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 14, i32 14, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison>
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP9]], i32 15
-; CHECK-NEXT:    [[TMP20:%.*]] = fmul <16 x float> [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    [[TMP18:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul <16 x float> [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP20]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = call float @foo(float [[TMP21]])
 ; CHECK-NEXT:    ret i1 false


        


More information about the llvm-commits mailing list