[llvm] f82eb7e - [SLP]Introduce gather cost estimation function.
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 13 10:19:14 PDT 2023
Author: Alexey Bataev
Date: 2023-04-13T10:16:00-07:00
New Revision: f82eb7e066f322a231627383fc80522d98ce6181
URL: https://github.com/llvm/llvm-project/commit/f82eb7e066f322a231627383fc80522d98ce6181
DIFF: https://github.com/llvm/llvm-project/commit/f82eb7e066f322a231627383fc80522d98ce6181.diff
LOG: [SLP]Introduce gather cost estimation function.
Introduced the BoUpSLP::ShuffleCostEstimator::gather function as an initial
implementation of the gather/buildvector cost estimation for buildvector
nodes. It allows the general codegen infrastructure to be reused for better
cost estimation, and it improves the cost estimation for
gathers/buildvectors.
Improved part of D110978.
Differential Revision: https://reviews.llvm.org/D148174
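As a rough illustration of the splat handling in the new getBuildVectorCost
(see the diff below), here is a minimal standalone sketch in plain C++; the
Lane struct and needsBroadcastShuffle helper are hypothetical stand-ins for
LLVM's Value lanes and are not part of the patch. A broadcast shuffle is
charged only for a non-identity splat: the non-undef scalar must occur more
than once, and either it is not in lane 0 or some other lane is defined.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <iterator>
#include <vector>

// Hypothetical stand-in for an SLP lane: either a concrete scalar or undef.
struct Lane {
  bool IsUndef;
  int ScalarId; // meaningful only when !IsUndef
};

static bool sameScalar(const Lane &A, const Lane &B) {
  return !A.IsUndef && !B.IsUndef && A.ScalarId == B.ScalarId;
}

// Mirrors the NeedShuffle computation in getBuildVectorCost: charge a
// broadcast shuffle only when the splat is not a plain identity insertion.
static bool needsBroadcastShuffle(const std::vector<Lane> &VL) {
  auto It = std::find_if(VL.begin(), VL.end(),
                         [](const Lane &L) { return !L.IsUndef; });
  assert(It != VL.end() && "Expected at least one non-undef value.");
  auto Repeats = std::count_if(
      VL.begin(), VL.end(), [&](const Lane &L) { return sameScalar(L, *It); });
  bool FrontIsScalar = sameScalar(VL.front(), *It);
  bool RestAllUndef =
      std::all_of(std::next(VL.begin()), VL.end(),
                  [](const Lane &L) { return L.IsUndef; });
  return Repeats > 1 && (!FrontIsScalar || !RestAllUndef);
}

int main() {
  // <s, undef, undef, undef>: identity insertion, no shuffle needed.
  std::vector<Lane> Identity = {{false, 7}, {true, 0}, {true, 0}, {true, 0}};
  // <s, s, s, s>: true splat, needs insertelement + broadcast shuffle.
  std::vector<Lane> Splat = {{false, 7}, {false, 7}, {false, 7}, {false, 7}};
  std::cout << needsBroadcastShuffle(Identity) << ' '
            << needsBroadcastShuffle(Splat) << '\n'; // prints: 0 1
}

An identity pattern like <s, undef, undef, undef> pays only for the
insertelement; a true splat additionally pays for the SK_Broadcast shuffle.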
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7f79095153e90..6917c8cf9cc25 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6795,6 +6795,131 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
BoUpSLP &R;
constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+ if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
+ return TTI::TCC_Free;
+ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ InstructionCost GatherCost = 0;
+ SmallVector<Value *> Gathers(VL.begin(), VL.end());
+ // Improve gather cost for gather of loads, if we can group some of the
+ // loads into vector loads.
+ InstructionsState S = getSameOpcode(VL, *R.TLI);
+ if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+ !S.isAltShuffle() &&
+ !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+ !isSplat(Gathers)) {
+ BoUpSLP::ValueSet VectorizedLoads;
+ unsigned StartIdx = 0;
+ unsigned VF = VL.size() / 2;
+ unsigned VectorizedCnt = 0;
+ unsigned ScatterVectorizeCnt = 0;
+ const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
+ for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+ for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+ Cnt += VF) {
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+ if (!VectorizedLoads.count(Slice.front()) &&
+ !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+ SmallVector<Value *> PointerOps;
+ OrdersType CurrentOrder;
+ LoadsState LS =
+ canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+ *R.LI, *R.TLI, CurrentOrder, PointerOps);
+ switch (LS) {
+ case LoadsState::Vectorize:
+ case LoadsState::ScatterVectorize:
+ // Mark the vectorized loads so that we don't vectorize them
+ // again.
+ if (LS == LoadsState::Vectorize)
+ ++VectorizedCnt;
+ else
+ ++ScatterVectorizeCnt;
+ VectorizedLoads.insert(Slice.begin(), Slice.end());
+ // If we vectorized initial block, no need to try to vectorize
+ // it again.
+ if (Cnt == StartIdx)
+ StartIdx += VF;
+ break;
+ case LoadsState::Gather:
+ break;
+ }
+ }
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= VL.size())
+ break;
+ // Found vectorizable parts - exit.
+ if (!VectorizedLoads.empty())
+ break;
+ }
+ if (!VectorizedLoads.empty()) {
+ unsigned NumParts = TTI.getNumberOfParts(VecTy);
+ bool NeedInsertSubvectorAnalysis =
+ !NumParts || (VL.size() / VF) > NumParts;
+ // Get the cost for gathered loads.
+ for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+ if (VectorizedLoads.contains(VL[I]))
+ continue;
+ GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+ }
+ // Exclude potentially vectorized loads from list of gathered
+ // scalars.
+ auto *LI = cast<LoadInst>(S.MainOp);
+ Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+ // The cost for vectorized loads.
+ InstructionCost ScalarsCost = 0;
+ for (Value *V : VectorizedLoads) {
+ auto *LI = cast<LoadInst>(V);
+ ScalarsCost +=
+ TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+ LI->getAlign(), LI->getPointerAddressSpace(),
+ CostKind, TTI::OperandValueInfo(), LI);
+ }
+ auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+ Align Alignment = LI->getAlign();
+ GatherCost +=
+ VectorizedCnt *
+ TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+ LI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo(), LI);
+ GatherCost += ScatterVectorizeCnt *
+ TTI.getGatherScatterOpCost(
+ Instruction::Load, LoadTy, LI->getPointerOperand(),
+ /*VariableMask=*/false, Alignment, CostKind, LI);
+ if (NeedInsertSubvectorAnalysis) {
+ // Add the cost for the subvectors insert.
+ for (int I = VF, E = VL.size(); I < E; I += VF)
+ GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+ std::nullopt, CostKind, I, LoadTy);
+ }
+ GatherCost -= ScalarsCost;
+ }
+ } else if (!Root && isSplat(VL)) {
+ // Found the broadcasting of the single scalar, calculate the cost as
+ // the broadcast.
+ const auto *It =
+ find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+ assert(It != VL.end() && "Expected at least one non-undef value.");
+ // Add broadcast for non-identity shuffle only.
+ bool NeedShuffle =
+ count(VL, *It) > 1 &&
+ (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+ InstructionCost InsertCost = TTI.getVectorInstrCost(
+ Instruction::InsertElement, VecTy, CostKind,
+ NeedShuffle ? 0 : std::distance(VL.begin(), It),
+ PoisonValue::get(VecTy), *It);
+ return InsertCost +
+ (NeedShuffle ? TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VecTy,
+ /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
+ /*SubTp=*/nullptr, /*Args=*/*It)
+ : TTI::TCC_Free);
+ }
+ return GatherCost + (all_of(Gathers, UndefValue::classof)
+ ? TTI::TCC_Free
+ : R.getGatherCost(Gathers));
+ };
+
public:
ShuffleCostEstimator(TargetTransformInfo &TTI,
ArrayRef<Value *> VectorizedVals, BoUpSLP &R)
@@ -6887,6 +7012,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
return VecBase;
}
+ void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+ Cost += getBuildVectorCost(VL, Root);
+ }
/// Finalize emission of the shuffles.
InstructionCost finalize() {
IsFinalized = true;
@@ -6957,7 +7085,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
GatheredScalars.append(VF - GatheredScalars.size(),
PoisonValue::get(ScalarTy));
}
- InstructionCost ExtractCost = Estimator.finalize();
// Do not try to look for reshuffled loads for gathered loads (they will be
// handled later), for vectorized scalars, and cases, which are definitely
@@ -7007,9 +7134,11 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
::addMask(Mask, E->ReuseShuffleIndices);
GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
}
- if (!all_of(GatheredScalars, UndefValue::classof))
- GatherCost += getGatherCost(GatheredScalars);
- return GatherCost;
+ Estimator.gather(
+ GatheredScalars,
+ Constant::getNullValue(FixedVectorType::get(
+ GatheredScalars.front()->getType(), GatheredScalars.size())));
+ return GatherCost + Estimator.finalize();
}
if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
// Check that gather of extractelements can be represented as just a
@@ -7022,129 +7151,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
if (NeedToShuffleReuses)
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
FinalVecTy, E->ReuseShuffleIndices);
- return Cost + ExtractCost;
- }
- if (isSplat(VL)) {
- // Found the broadcasting of the single scalar, calculate the cost as the
- // broadcast.
- assert(VecTy == FinalVecTy &&
- "No reused scalars expected for broadcast.");
- const auto *It =
- find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
- // If all values are undefs - consider cost free.
- if (It == VL.end())
- return TTI::TCC_Free;
- // Add broadcast for non-identity shuffle only.
- bool NeedShuffle =
- count(VL, *It) > 1 &&
- (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
- InstructionCost InsertCost = TTI->getVectorInstrCost(
- Instruction::InsertElement, VecTy, CostKind,
- NeedShuffle ? 0 : std::distance(VL.begin(), It),
- PoisonValue::get(VecTy), *It);
- return InsertCost + (NeedShuffle
- ? TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VecTy,
- /*Mask=*/std::nullopt, CostKind,
- /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/*It)
- : TTI::TCC_Free);
+ return Cost + Estimator.finalize();
}
InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses)
ReuseShuffleCost = TTI->getShuffleCost(
TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
- // Improve gather cost for gather of loads, if we can group some of the
- // loads into vector loads.
- if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
- !E->isAltShuffle()) {
- BoUpSLP::ValueSet VectorizedLoads;
- unsigned StartIdx = 0;
- unsigned VF = VL.size() / 2;
- unsigned VectorizedCnt = 0;
- unsigned ScatterVectorizeCnt = 0;
- const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
- for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
- for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
- Cnt += VF) {
- ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
- if (!VectorizedLoads.count(Slice.front()) &&
- !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
- SmallVector<Value *> PointerOps;
- OrdersType CurrentOrder;
- LoadsState LS =
- canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
- *TLI, CurrentOrder, PointerOps);
- switch (LS) {
- case LoadsState::Vectorize:
- case LoadsState::ScatterVectorize:
- // Mark the vectorized loads so that we don't vectorize them
- // again.
- if (LS == LoadsState::Vectorize)
- ++VectorizedCnt;
- else
- ++ScatterVectorizeCnt;
- VectorizedLoads.insert(Slice.begin(), Slice.end());
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += VF;
- break;
- case LoadsState::Gather:
- break;
- }
- }
- }
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= VL.size())
- break;
- // Found vectorizable parts - exit.
- if (!VectorizedLoads.empty())
- break;
- }
- if (!VectorizedLoads.empty()) {
- InstructionCost GatherCost = 0;
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
- bool NeedInsertSubvectorAnalysis =
- !NumParts || (VL.size() / VF) > NumParts;
- // Get the cost for gathered loads.
- for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
- if (VectorizedLoads.contains(VL[I]))
- continue;
- GatherCost += getGatherCost(VL.slice(I, VF));
- }
- // The cost for vectorized loads.
- InstructionCost ScalarsCost = 0;
- for (Value *V : VectorizedLoads) {
- auto *LI = cast<LoadInst>(V);
- ScalarsCost +=
- TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
- LI->getAlign(), LI->getPointerAddressSpace(),
- CostKind, TTI::OperandValueInfo(), LI);
- }
- auto *LI = cast<LoadInst>(E->getMainOp());
- auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
- Align Alignment = LI->getAlign();
- GatherCost +=
- VectorizedCnt *
- TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
- LI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), LI);
- GatherCost += ScatterVectorizeCnt *
- TTI->getGatherScatterOpCost(
- Instruction::Load, LoadTy, LI->getPointerOperand(),
- /*VariableMask=*/false, Alignment, CostKind, LI);
- if (NeedInsertSubvectorAnalysis) {
- // Add the cost for the subvectors insert.
- for (int I = VF, E = VL.size(); I < E; I += VF)
- GatherCost +=
- TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
- std::nullopt, CostKind, I, LoadTy);
- }
- return ReuseShuffleCost + GatherCost - ScalarsCost;
- }
- }
- return ReuseShuffleCost + getGatherCost(VL);
+ Estimator.gather(GatheredScalars);
+ return ReuseShuffleCost + Estimator.finalize();
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 61aa9110e123c..455693e27b349 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -3,27 +3,21 @@
define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
; CHECK-LABEL: @test(
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
-; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; CHECK-NEXT: br label [[TMP17:%.*]]
-; CHECK: 17:
-; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ]
-; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0
-; CHECK-NEXT: br label [[TMP17]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> <i32 0, i32 2, i32 undef, i32 2>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT: br label [[TMP11:%.*]]
+; CHECK: 11:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT: br label [[TMP11]]
;
%4 = extractelement <2 x i64> %1, i64 0
%5 = or i64 %4, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index 52a84e6248f79..0365d046cbf36 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -524,36 +524,22 @@ define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %
define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @foo(
-; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 undef, i32 1>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT: ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @foo(
-; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT: ret i1 [[CMP_I185]]
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 undef, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
%sub14.i167 = fsub float undef, %vecext.i291.i166
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 9d52dfa41db36..7faff0c39a739 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,21 +6,23 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
+; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
; CHECK-NEXT: br label [[WHILE:%.*]]
; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
-; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
-; CHECK-NEXT: [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
+; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP0]], [[TMP11]]
; CHECK-NEXT: br label [[WHILE]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index 97fb5686c8710..c5b6ac647aa32 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,33 +93,20 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
}
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
-; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
-; SSE-NEXT: ret i1 [[S3]]
-;
-; AVX-LABEL: @logical_and_icmp_diff_preds(
-; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 0
-; AVX-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; AVX-NEXT: ret i1 [[S3]]
+; CHECK-LABEL: @logical_and_icmp_diff_preds(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 3, i32 1, i32 7>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 0, i32 poison>, <4 x i32> <i32 4, i32 5, i32 6, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; CHECK-NEXT: ret i1 [[S3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
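The load-grouping loop in getBuildVectorCost above tries contiguous slices of
VF = N/2, N/4, ... down to the minimum VF and stops as soon as some VF yields
a vectorizable group. Below is a minimal standalone model of that slicing
order, with a hypothetical Classify callback standing in for
canVectorizeLoads; it is a sketch of the control flow, not the tree code.

#include <algorithm>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

// Stand-in for the patch's LoadsState.
enum class LoadsState { Vectorize, ScatterVectorize, Gather };

// Models the slicing order of the load-grouping loop: try contiguous slices
// of VF = N/2, N/4, ... down to MinVF, skip slices that touch an already
// vectorized group, and stop once any VF produced a group.
static std::vector<std::pair<unsigned, unsigned>> // (start, VF) per group
groupLoads(unsigned N, unsigned MinVF,
           const std::function<LoadsState(unsigned, unsigned)> &Classify) {
  std::vector<std::pair<unsigned, unsigned>> Groups;
  std::vector<bool> Vectorized(N, false);
  unsigned StartIdx = 0;
  for (unsigned VF = N / 2; VF >= MinVF; VF /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + VF <= N; Cnt += VF) {
      if (Vectorized[Cnt] || Vectorized[Cnt + VF - 1])
        continue; // slice overlaps an already vectorized group
      if (Classify(Cnt, VF) == LoadsState::Gather)
        continue;
      Groups.emplace_back(Cnt, VF);
      std::fill(Vectorized.begin() + Cnt, Vectorized.begin() + Cnt + VF, true);
      if (Cnt == StartIdx)
        StartIdx += VF; // never revisit a vectorized prefix
    }
    if (StartIdx >= N || !Groups.empty())
      break; // whole array covered, or vectorizable parts found
  }
  return Groups;
}

int main() {
  // Pretend only the first four of eight loads are consecutive.
  auto Classify = [](unsigned Start, unsigned VF) {
    return Start + VF <= 4 ? LoadsState::Vectorize : LoadsState::Gather;
  };
  for (auto [Start, VF] : groupLoads(8, 2, Classify))
    std::cout << "group at " << Start << ", VF=" << VF << '\n';
  // prints: group at 0, VF=4
}

With eight loads of which only the first four are consecutive, the model
forms one VF=4 group at offset 0; the remaining lanes fall through to the
per-slice gather costing, matching the StartIdx bookkeeping in the patch.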