[llvm] [SLP]Better cost estimation for masked gather or "clustered" loads. (PR #105858)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 30 11:27:22 PDT 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/105858
>From 0d0a1d8235ad4e9d81efcb2efbb3860dd6d13e73 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 23 Aug 2024 17:00:57 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 226 ++++++++++++------
.../SLPVectorizer/X86/pr47629-inseltpoison.ll | 20 +-
.../Transforms/SLPVectorizer/X86/pr47629.ll | 20 +-
.../X86/redux-feed-buildvector.ll | 44 +++-
...masked-loads-consecutive-loads-same-ptr.ll | 20 +-
.../X86/reorder-possible-strided-node.ll | 72 ++++--
.../X86/reorder-reused-masked-gather2.ll | 26 +-
7 files changed, 293 insertions(+), 135 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 949579772b94d5..f2ebc444573399 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4780,16 +4780,74 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
}
}
- auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
+ // Compare the cost of loads + shuffles against the cost of a strided or
+ // masked gather load. Returns true if the vectorized loads + shuffles
+ // representation is better than just a gather.
+ auto CheckForShuffledLoads = [&,
+ &TTI = *TTI](Align CommonAlignment,
+ bool ProfitableGatherPointers) {
+ // Compare masked gather cost and loads + insert subvector costs.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto [ScalarGEPCost, VectorGEPCost] =
+ getGEPCosts(TTI, PointerOps, PointerOps.front(),
+ Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
+ // Estimate the cost of the masked gather's GEP. If the pointers are not a
+ // splat, roughly estimate it as a buildvector; otherwise, as a splat.
+ if (static_cast<unsigned>(
+ count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
+ PointerOps.size() - 1 ||
+ any_of(PointerOps, [&](Value *V) {
+ return getUnderlyingObject(V) !=
+ getUnderlyingObject(PointerOps.front());
+ }))
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ VecTy,
+ APInt::getAllOnes(VecTy->getElementCount().getKnownMinValue()),
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ else
+ VectorGEPCost +=
+ TTI.getScalarizationOverhead(
+ VecTy,
+ APInt::getOneBitSet(VecTy->getElementCount().getKnownMinValue(),
+ 0),
+ /*Insert=*/true, /*Extract=*/false, CostKind) +
+ ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, std::nullopt,
+ CostKind);
+ // The cost of scalar loads.
+ InstructionCost ScalarLoadsCost =
+ std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+ [&](InstructionCost C, Value *V) {
+ return C + TTI.getInstructionCost(
+ cast<Instruction>(V), CostKind);
+ }) +
+ ScalarGEPCost;
+ // The cost of masked gather.
+ InstructionCost MaskedGatherCost =
+ TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
+ cast<LoadInst>(VL0)->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment,
+ CostKind) +
+ (ProfitableGatherPointers ? 0 : VectorGEPCost);
+ APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
+ InstructionCost GatherCost =
+ TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarLoadsCost;
+ // The list of loads is small, or we already performed a partial check -
+ // directly compare the masked gather cost and the gather cost.
+ constexpr unsigned ListLimit = 4;
+ if (!TryRecursiveCheck || VL.size() < ListLimit)
+ return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
- unsigned MinVF = getMinVF(Sz);
- unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
+ unsigned MinVF = 2;
+ unsigned MaxVF = bit_floor(VL.size() / 2);
MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
+ DemandedElts.clearAllBits();
+ // Iterate through the possible vectorization factors and check whether
+ // vectorized loads + shuffles are better than just a gather.
for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
- unsigned VectorizedCnt = 0;
SmallVector<LoadsState> States;
- for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
- Cnt += VF, ++VectorizedCnt) {
+ for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
SmallVector<unsigned> Order;
SmallVector<Value *> PointerOps;
@@ -4797,8 +4855,10 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
/*TryRecursiveCheck=*/false);
// Check that the sorted loads are consecutive.
- if (LS == LoadsState::Gather)
- break;
+ if (LS == LoadsState::Gather) {
+ DemandedElts.setBits(Cnt, Cnt + VF);
+ continue;
+ }
// If reordering is needed - consider as a high-cost masked gather for now.
if ((LS == LoadsState::Vectorize ||
LS == LoadsState::StridedVectorize) &&
@@ -4806,79 +4866,94 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
LS = LoadsState::ScatterVectorize;
States.push_back(LS);
}
+ if (DemandedElts.isAllOnes())
+ // All loads gathered - try smaller VF.
+ continue;
+ InstructionCost ScalarVFGEPCost = 0;
// Can be vectorized later as a series of loads/insertelements.
- if (VectorizedCnt == VL.size() / VF) {
- // Compare masked gather cost and loads + insersubvector costs.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
- TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
- CostKind, ScalarTy, VecTy);
- InstructionCost MaskedGatherCost =
- TTI.getGatherScatterOpCost(
- Instruction::Load, VecTy,
- cast<LoadInst>(VL0)->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind) +
- VectorGEPCost - ScalarGEPCost;
- InstructionCost VecLdCost = 0;
- auto *SubVecTy = getWidenedType(ScalarTy, VF);
- for (auto [I, LS] : enumerate(States)) {
- auto *LI0 = cast<LoadInst>(VL[I * VF]);
- switch (LS) {
- case LoadsState::Vectorize: {
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::Load,
- CostKind, ScalarTy, SubVecTy);
- VecLdCost += TTI.getMemoryOpCost(
- Instruction::Load, SubVecTy, LI0->getAlign(),
- LI0->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo()) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::StridedVectorize: {
- auto [ScalarGEPCost, VectorGEPCost] =
- getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::Load,
- CostKind, ScalarTy, SubVecTy);
+ InstructionCost VecLdCost = 0;
+ if (!DemandedElts.isZero()) {
+ VecLdCost =
+ TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+ /*Extract=*/false, CostKind) +
+ ScalarGEPCost;
+ for (unsigned Idx : seq<unsigned>(VL.size()))
+ if (DemandedElts[Idx])
VecLdCost +=
- TTI.getStridedMemoryOpCost(
- Instruction::Load, SubVecTy, LI0->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::ScatterVectorize: {
- auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
- TTI, ArrayRef(PointerOps).slice(I * VF, VF),
- LI0->getPointerOperand(), Instruction::GetElementPtr,
- CostKind, ScalarTy, SubVecTy);
- VecLdCost +=
- TTI.getGatherScatterOpCost(
- Instruction::Load, SubVecTy, LI0->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind) +
- VectorGEPCost - ScalarGEPCost;
- break;
- }
- case LoadsState::Gather:
- llvm_unreachable(
- "Expected only consecutive, strided or masked gather loads.");
- }
- SmallVector<int> ShuffleMask(VL.size());
- for (int Idx : seq<int>(0, VL.size()))
- ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+ TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
+ }
+ auto *SubVecTy = getWidenedType(ScalarTy, VF);
+ for (auto [I, LS] : enumerate(States)) {
+ auto *LI0 = cast<LoadInst>(VL[I * VF]);
+ InstructionCost VectorGEPCost =
+ (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
+ ? 0
+ : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+ LI0->getPointerOperand(),
+ Instruction::GetElementPtr, CostKind, ScalarTy,
+ SubVecTy)
+ .second;
+ if (LS == LoadsState::ScatterVectorize) {
+ if (static_cast<unsigned>(
+ count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
+ PointerOps.size() - 1 ||
+ any_of(PointerOps, [&](Value *V) {
+ return getUnderlyingObject(V) !=
+ getUnderlyingObject(PointerOps.front());
+ }))
+ VectorGEPCost += TTI.getScalarizationOverhead(
+ SubVecTy, APInt::getAllOnes(VF),
+ /*Insert=*/true, /*Extract=*/false, CostKind);
+ else
+ VectorGEPCost +=
+ TTI.getScalarizationOverhead(
+ SubVecTy, APInt::getOneBitSet(VF, 0),
+ /*Insert=*/true, /*Extract=*/false, CostKind) +
+ ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy,
+ std::nullopt, CostKind);
+ }
+ switch (LS) {
+ case LoadsState::Vectorize:
+ VecLdCost += TTI.getMemoryOpCost(
+ Instruction::Load, SubVecTy, LI0->getAlign(),
+ LI0->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ VectorGEPCost;
+ break;
+ case LoadsState::StridedVectorize:
+ VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
+ LI0->getPointerOperand(),
+ /*VariableMask=*/false,
+ CommonAlignment, CostKind) +
+ VectorGEPCost;
+ break;
+ case LoadsState::ScatterVectorize:
+ VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
+ LI0->getPointerOperand(),
+ /*VariableMask=*/false,
+ CommonAlignment, CostKind) +
+ VectorGEPCost;
+ break;
+ case LoadsState::Gather:
+ // Gathers are already calculated - ignore.
+ continue;
+ }
+ SmallVector<int> ShuffleMask(VL.size());
+ for (int Idx : seq<int>(0, VL.size()))
+ ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
+ if (I > 0)
VecLdCost +=
::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
ShuffleMask, CostKind, I * VF, SubVecTy);
- }
- // If masked gather cost is higher - better to vectorize, so
- // consider it as a gather node. It will be better estimated
- // later.
- if (MaskedGatherCost >= VecLdCost)
- return true;
}
+ // If masked gather cost is higher - better to vectorize, so
+ // consider it as a gather node. It will be better estimated
+ // later.
+ if (MaskedGatherCost >= VecLdCost &&
+ VecLdCost - GatherCost < -SLPCostThreshold)
+ return true;
}
- return false;
+ return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
};
// TODO: need to improve analysis of the pointers, if not all of them are
// GEPs or have > 2 operands, we end up with a gather node, which just
@@ -4900,7 +4975,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
!TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
// Check if potential masked gather can be represented as series
// of loads + insertsubvectors.
- if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
+ if (TryRecursiveCheck &&
+ CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) {
// If masked gather cost is higher - better to vectorize, so
// consider it as a gather node. It will be better estimated
// later.
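
The splat-versus-buildvector heuristic used above for the gather's pointer operand (applied once to the whole pointer vector and again per sub-slice) can be modeled outside of LLVM. The sketch below is illustrative only: Pointer, InsertCost, and BroadcastCost are invented stand-ins, where the patch itself queries TargetTransformInfo::getScalarizationOverhead and getShuffleCost for the real numbers.

#include <cassert>
#include <iostream>
#include <vector>

// Invented unit costs; the patch queries TTI instead of constants.
constexpr int InsertCost = 1;    // one insertelement
constexpr int BroadcastCost = 1; // one SK_Broadcast shuffle

struct Pointer {
  bool IsGEP;           // models isa<GetElementPtrInst>
  int UnderlyingObject; // models getUnderlyingObject identity
};

// Mirrors the heuristic: if fewer than Size-1 pointers are GEPs, or they do
// not all share one underlying object, price the pointer vector as a full
// buildvector; otherwise price it as a single insert plus a broadcast.
int pointerVectorCost(const std::vector<Pointer> &Ptrs) {
  assert(!Ptrs.empty());
  int GEPCount = 0;
  bool SameObject = true;
  for (const Pointer &P : Ptrs) {
    GEPCount += P.IsGEP;
    SameObject &= P.UnderlyingObject == Ptrs.front().UnderlyingObject;
  }
  if (GEPCount < (int)Ptrs.size() - 1 || !SameObject)
    return (int)Ptrs.size() * InsertCost; // buildvector: insert every lane
  return InsertCost + BroadcastCost;      // splat: one insert + broadcast
}

int main() {
  std::vector<Pointer> Splat(4, {true, 0});
  std::vector<Pointer> Mixed{{true, 0}, {true, 1}, {false, 0}, {true, 2}};
  std::cout << "splat-like:  " << pointerVectorCost(Splat) << '\n'; // 2
  std::cout << "mixed bases: " << pointerVectorCost(Mixed) << '\n'; // 4
}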
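
Likewise, the overall decision CheckForShuffledLoads now makes - whole-vector masked gather versus plain scalarization versus contiguous sub-loads stitched together with SK_InsertSubvector shuffles, trying vectorization factors from large to small - can be sketched with the same kind of invented unit costs. The model below only distinguishes contiguous and gathered slices; the real code also prices strided and scatter-vectorized slices through TTI and compares against SLPCostThreshold rather than taking a plain minimum.

#include <algorithm>
#include <iostream>
#include <vector>

// Invented unit costs standing in for the TTI queries in the patch.
constexpr int ScalarLoadCost = 1;    // one scalar load
constexpr int InsertCost = 1;        // one insertelement
constexpr int WideLoadCost = 1;      // one contiguous vector load, any VF
constexpr int SubvecShuffleCost = 1; // one SK_InsertSubvector shuffle
constexpr int MaskedGatherCost = 8;  // one masked gather of the whole vector

// True if the VF-wide slice starting at Begin is consecutive in memory;
// Offsets model the element offsets of the pointer operands.
static bool sliceIsContiguous(const std::vector<int> &Offsets, unsigned Begin,
                              unsigned VF) {
  for (unsigned I = 1; I < VF; ++I)
    if (Offsets[Begin + I] != Offsets[Begin] + (int)I)
      return false;
  return true;
}

int main() {
  // Two contiguous runs of four elements, far apart (N assumed a power of
  // two for simplicity, matching bit_floor(VL.size() / 2) in the patch).
  std::vector<int> Offsets = {0, 1, 2, 3, 40, 41, 42, 43};
  unsigned N = (unsigned)Offsets.size();

  int ScalarGather = (int)N * (ScalarLoadCost + InsertCost);
  int Best = std::min(MaskedGatherCost, ScalarGather);
  for (unsigned VF = N / 2; VF >= 2; VF /= 2) {
    int Cost = 0;
    for (unsigned Begin = 0; Begin + VF <= N; Begin += VF) {
      if (sliceIsContiguous(Offsets, Begin, VF))
        Cost += WideLoadCost + (Begin ? SubvecShuffleCost : 0);
      else // gathered slice: scalar loads + inserts for its lanes
        Cost += (int)VF * (ScalarLoadCost + InsertCost);
    }
    Best = std::min(Best, Cost);
    std::cout << "VF=" << VF << " loads+shuffles cost=" << Cost << '\n';
  }
  std::cout << "masked gather=" << MaskedGatherCost
            << " scalar gather=" << ScalarGather << " best=" << Best << '\n';
}

For the offsets chosen here (two contiguous runs of four), two wide loads plus one insert-subvector shuffle beat the masked gather, which is the shape of the test updates below: spread-out loads that previously became @llvm.masked.gather calls now come out as plain loads plus insertelements or @llvm.vector.insert.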
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 5b33c6e889363e..89bc44dc1d530a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
+; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, ptr %1, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 09d6c77557efaa..c1b501015e81e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0
-; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
+; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
+; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
+; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, ptr %1, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index 83457cc4966f7c..729d5fd5546dc8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -10,19 +10,39 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
+; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]])
-; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1
+; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3
+; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8
+; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5
+; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8
+; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7
+; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8
+; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9
+; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8
+; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11
+; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8
+; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13
+; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8
+; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15
+; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> poison, double [[LD1_0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[LD1_1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[LD1_2]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[LD1_3]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[LD1_4]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[LD1_5]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[LD1_6]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[LD1_7]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <8 x double> [[TMP0]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <8 x double> [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP12]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP13]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> <i1 true, i1 true>)
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
index 40dcc79f79ffce..09a5ace101e645 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll
@@ -8,19 +8,23 @@
; YAML-NEXT: Function: test
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
-; YAML-NEXT: - Cost: '-5'
+; YAML-NEXT: - Cost: '-7'
; YAML-NEXT: - String: ' and with tree size '
-; YAML-NEXT: - TreeSize: '7'
+; YAML-NEXT: - TreeSize: '5'
define void @test(ptr noalias %p, ptr noalias %p1) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> <i64 0, i64 32, i64 33, i64 34>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32
+; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2)
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]]
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index cfbbe14186b501..eacfbda5447c7b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -5,13 +5,23 @@ define void @test() {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
; CHECK-NEXT: ret void
;
entry:
@@ -57,15 +67,25 @@ define void @test1() {
; CHECK-LABEL: define void @test1(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]]
; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
-; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP13]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
; CHECK-NEXT: ret void
;
entry:
@@ -111,12 +131,22 @@ define void @test_div() {
; CHECK-LABEL: define void @test_div(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], <i32 1, i32 2, i32 1, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], <i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
; CHECK-NEXT: ret void
;
@@ -163,12 +193,22 @@ define void @test_rem() {
; CHECK-LABEL: define void @test_rem(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], <i32 1, i32 2, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], <i32 1, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index 30f328293cdaa3..c114c5dee78e99 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -6,18 +6,20 @@ target triple = "x86_64-unknown-linux-gnu"
define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
-; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2)
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> <i32 0, i32 5, i32 2, i32 7, i32 4, i32 1, i32 6, i32 3>
+; CHECK-NEXT: store <8 x float> [[TMP15]], ptr addrspace(1) [[TMP3]], align 4
; CHECK-NEXT: ret void
;
%3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8