[llvm] 88e7b8b - [SLP]Use TTI::getScalarizationOverhead where possible

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 4 15:49:46 PST 2025


Author: Alexey Bataev
Date: 2025-02-04T18:49:43-05:00
New Revision: 88e7b8b81c061113399637f936937ffaf5a9bc08

URL: https://github.com/llvm/llvm-project/commit/88e7b8b81c061113399637f936937ffaf5a9bc08
DIFF: https://github.com/llvm/llvm-project/commit/88e7b8b81c061113399637f936937ffaf5a9bc08.diff

LOG: [SLP]Use TTI::getScalarizationOverhead where possible

Better to use TTI::getScalarizationOverhead instead of
TTI::getVectorInstrCost to correctly calculate the costs of
buildvectors/extracts.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/125725

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
    llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
    llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
    llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
    llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
    llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 558d75c5eb388fe..7b3526cca119f5c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10706,6 +10706,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         });
     SmallPtrSet<Value *, 4> UniqueBases;
     unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
     for (unsigned Part : seq<unsigned>(NumParts)) {
       unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
       ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
@@ -10756,10 +10757,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             continue;
           }
         }
-        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
-                                       CostKind, Idx);
-      }
-    }
+        APInt &DemandedElts =
+            VectorOpsToExtracts
+                .try_emplace(VecBase,
+                             APInt::getZero(getNumElements(VecBase->getType())))
+                .first->getSecond();
+        DemandedElts.setBit(Idx);
+      }
+    }
+    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
+      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
+                                           DemandedElts, /*Insert=*/false,
+                                           /*Extract=*/true, CostKind);
     // Check that gather of extractelements can be represented as just a
     // shuffle of a single/two vectors the scalars are extracted from.
     // Found the bunch of extractelement instructions that must be gathered
@@ -11283,24 +11292,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
+    APInt DemandedElts;
+    VectorType *SrcVecTy = nullptr;
     auto GetScalarCost = [&](unsigned Idx) {
       if (isa<PoisonValue>(UniqueValues[Idx]))
         return InstructionCost(TTI::TCC_Free);
 
       auto *I = cast<Instruction>(UniqueValues[Idx]);
-      VectorType *SrcVecTy;
-      if (ShuffleOrOp == Instruction::ExtractElement) {
-        auto *EE = cast<ExtractElementInst>(I);
-        SrcVecTy = EE->getVectorOperandType();
-      } else {
-        auto *EV = cast<ExtractValueInst>(I);
-        Type *AggregateTy = EV->getAggregateOperand()->getType();
-        unsigned NumElts;
-        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
-          NumElts = ATy->getNumElements();
-        else
-          NumElts = AggregateTy->getStructNumElements();
-        SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
+      if (!SrcVecTy) {
+        if (ShuffleOrOp == Instruction::ExtractElement) {
+          auto *EE = cast<ExtractElementInst>(I);
+          SrcVecTy = EE->getVectorOperandType();
+        } else {
+          auto *EV = cast<ExtractValueInst>(I);
+          Type *AggregateTy = EV->getAggregateOperand()->getType();
+          unsigned NumElts;
+          if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
+            NumElts = ATy->getNumElements();
+          else
+            NumElts = AggregateTy->getStructNumElements();
+          SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
+        }
       }
       if (I->hasOneUse()) {
         Instruction *Ext = I->user_back();
@@ -11317,10 +11329,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           return Cost;
         }
       }
-      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
-                                     CostKind, *getExtractIndex(I));
+      if (DemandedElts.isZero())
+        DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
+      DemandedElts.setBit(*getExtractIndex(I));
+      return InstructionCost(TTI::TCC_Free);
+    };
+    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+      return CommonCost - (DemandedElts.isZero()
+                               ? TTI::TCC_Free
+                               : TTI.getScalarizationOverhead(
+                                     SrcVecTy, DemandedElts, /*Insert=*/false,
+                                     /*Extract=*/true, CostKind));
     };
-    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
   case Instruction::InsertElement: {
@@ -13663,6 +13683,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   // Check if the same elements are inserted several times and count them as
   // shuffle candidates.
   APInt ShuffledElements = APInt::getZero(VL.size());
+  APInt DemandedElements = APInt::getZero(VL.size());
   DenseMap<Value *, unsigned> UniqueElements;
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost;
@@ -13673,9 +13694,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
       V = nullptr;
     }
     if (!ForPoisonSrc)
-      Cost +=
-          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
-                                  I, Constant::getNullValue(VecTy), V);
+      DemandedElements.setBit(I);
   };
   SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -13698,6 +13717,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
     ShuffledElements.setBit(I);
     ShuffleMask[I] = Res.first->second;
   }
+  if (!DemandedElements.isZero())
+    Cost +=
+        TTI->getScalarizationOverhead(VecTy, DemandedElements, /*Insert=*/true,
+                                      /*Extract=*/false, CostKind, VL);
   if (ForPoisonSrc) {
     if (isa<FixedVectorType>(ScalarTy)) {
       assert(SLPReVec && "Only supported by REVEC.");

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
index 29bd81998cdb2a9..bb88edff1163471 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
@@ -607,35 +607,13 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3
 
 ; computes (a/const + x - y) * z
 define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
-; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
-; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], 2
-; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], 4
-; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
-; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
-;
-; SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; SVE-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
-; SVE-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
-; SVE-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
-; SVE-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
-; SVE-NEXT:    ret <2 x i32> [[TMP4]]
+; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
 ;
 {
   %a0 = extractelement <2 x i32> %a, i64 0

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
index 579239bc659bd85..75a413ffc1fb198 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll
@@ -10,15 +10,18 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) {
 ; CHECK-NEXT:    [[I4275:%.*]] = load double, ptr [[ID]], align 8
 ; CHECK-NEXT:    [[I4277:%.*]] = load double, ptr [[IE]], align 8
 ; CHECK-NEXT:    [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]]
-; CHECK-NEXT:    ret <4 x double> [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x double> [[I44281]]
 ;
   %i4238 = load double, ptr %ia, align 8
   %i4252 = load double, ptr %ib, align 8

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
index de99654d84eb81f..c2369a6a89ec1de 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 ; YAML-NEXT:  Function:        foo
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-3'
+; YAML-NEXT:    - Cost:            '-4'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '10'
 ; YAML-NEXT:  ...

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index 2cdbd5cff4468c2..cb4783010965e81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -49,11 +49,24 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
 ;
 ; AVX512-LABEL: @reduce_and4(
 ; AVX512-NEXT:  entry:
-; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX512-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX512-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; AVX512-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
+; AVX512-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; AVX512-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; AVX512-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; AVX512-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; AVX512-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
+; AVX512-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
+; AVX512-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
+; AVX512-NEXT:    [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
+; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8
+; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9
+; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11
+; AVX512-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14
+; AVX512-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15
+; AVX512-NEXT:    [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]])
 ; AVX512-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX512-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -131,11 +144,24 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
 ; AVX2-NEXT:    ret i32 [[OP_RDX]]
 ;
 ; AVX512-LABEL: @reduce_and4_transpose(
-; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX512-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX512-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; AVX512-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
+; AVX512-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
+; AVX512-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
+; AVX512-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
+; AVX512-NEXT:    [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
+; AVX512-NEXT:    [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
+; AVX512-NEXT:    [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
+; AVX512-NEXT:    [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
+; AVX512-NEXT:    [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8
+; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9
+; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10
+; AVX512-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11
+; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12
+; AVX512-NEXT:    [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13
+; AVX512-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14
+; AVX512-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15
+; AVX512-NEXT:    [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]])
 ; AVX512-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX512-NEXT:    ret i32 [[OP_RDX1]]
 ;

diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
index f1034f397113514..ae5018a63e2148d 100644
--- a/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
@@ -1,22 +1,43 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @tes() {
-; CHECK-LABEL: define void @tes() {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
-; CHECK-NEXT:    br label [[TMP1:%.*]]
-; CHECK:       1:
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
-; CHECK-NEXT:    br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP5:%.*]]
-; CHECK:       4:
-; CHECK-NEXT:    ret void
-; CHECK:       5:
-; CHECK-NEXT:    ret void
+; X86-LABEL: define void @tes() {
+; X86-NEXT:  entry:
+; X86-NEXT:    [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
+; X86-NEXT:    br label [[TMP1:%.*]]
+; X86:       1:
+; X86-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
+; X86-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
+; X86-NEXT:    [[OP_RDX:%.*]] = select i1 false, i1 [[TMP3]], i1 false
+; X86-NEXT:    [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
+; X86-NEXT:    br i1 [[OP_RDX1]], label [[TMP4:%.*]], label [[TMP5:%.*]]
+; X86:       4:
+; X86-NEXT:    ret void
+; X86:       5:
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: define void @tes() {
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[TMP0:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
+; AARCH64-NEXT:    [[TMP2:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
+; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AARCH64-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
+; AARCH64-NEXT:    br label [[TMP5:%.*]]
+; AARCH64:       5:
+; AARCH64-NEXT:    [[TMP6:%.*]] = select i1 false, i1 false, i1 false
+; AARCH64-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i1 [[TMP0]], i1 false
+; AARCH64-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], i1 [[TMP1]], i1 false
+; AARCH64-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i1 false, i1 false
+; AARCH64-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], i1 [[TMP3]], i1 false
+; AARCH64-NEXT:    [[TMP11:%.*]] = select i1 [[TMP10]], i1 [[TMP4]], i1 false
+; AARCH64-NEXT:    br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]]
+; AARCH64:       12:
+; AARCH64-NEXT:    ret void
+; AARCH64:       13:
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %0 = extractelement <2 x i1> zeroinitializer, i64 0

diff --git a/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll
index 2570cdb45e1e782..0a6b86c953ee016 100644
--- a/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll
+++ b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll
@@ -1,11 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @test() {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret void
+; X86-LABEL: @test(
+; X86-NEXT:  entry:
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: @test(
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[TMP0:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
+; AARCH64-NEXT:    [[TOBOOL:%.*]] = fcmp une half [[TMP0]], 0xH0000
+; AARCH64-NEXT:    [[TMP1:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
+; AARCH64-NEXT:    [[TOBOOL3:%.*]] = fcmp une half [[TMP1]], 0xH0000
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %0 = extractelement <8 x half> zeroinitializer, i64 1


        


More information about the llvm-commits mailing list