[llvm] 62267e8 - Revert "[SLP]Generalize cost model."
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 18 11:26:45 PDT 2022
Author: Alexey Bataev
Date: 2022-10-18T11:25:59-07:00
New Revision: 62267e8de06a6cc95fb9b91391a33c035baf000c
URL: https://github.com/llvm/llvm-project/commit/62267e8de06a6cc95fb9b91391a33c035baf000c
DIFF: https://github.com/llvm/llvm-project/commit/62267e8de06a6cc95fb9b91391a33c035baf000c.diff
LOG: Revert "[SLP]Generalize cost model."
This reverts commits f12fb91188b836e1bddb36bacbbdb8e4ab70b9b6 and
f5c747bfbe36b8f53e6fe2d85ffcaecba6d7153c to fix a detected use of an
uninitialized variable.
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
Removed:
################################################################################
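For orientation before the diff: the reverted change had factored the per-opcode cost computation in getEntryCost into a single GetCostDiff helper taking two callbacks, which this revert replaces with the older open-coded per-case math. A minimal compilable sketch of that helper pattern (simplified, not an excerpt: InstructionCost here is a stand-in for llvm::InstructionCost, and the uniform-cost flag approximates the isa<CastInst, CmpInst, SelectInst, CallInst> check in the removed code):

    #include <functional>

    using InstructionCost = long long; // stand-in for llvm::InstructionCost

    InstructionCost
    getCostDiff(unsigned Sz, bool UniformScalarCost, InstructionCost CommonCost,
                const std::function<InstructionCost(unsigned)> &ScalarEltCost,
                const std::function<InstructionCost(InstructionCost)> &VectorCost) {
      InstructionCost ScalarCost = 0;
      if (UniformScalarCost) {
        // One representative lane's cost times the number of scalar lanes.
        ScalarCost = Sz * ScalarEltCost(0);
      } else {
        for (unsigned I = 0; I < Sz; ++I)
          ScalarCost += ScalarEltCost(I);
      }
      // The vector-cost callback folds the common (shuffle) cost in itself.
      return VectorCost(CommonCost) - ScalarCost;
    }
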
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f35653197ff2f..658a943301ee5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5970,48 +5970,27 @@ static bool isAlternateInstruction(const Instruction *I,
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
unsigned OpIdx) {
assert(!VL.empty());
- const auto *I0 = cast<Instruction>(*find_if(VL, Instruction::classof));
- const auto *Op0 = I0->getOperand(OpIdx);
+ const auto *Op0 = cast<Instruction>(VL.front())->getOperand(OpIdx);
const bool IsConstant = all_of(VL, [&](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
- auto *Op = I->getOperand(OpIdx);
+ auto *Op = cast<Instruction>(V)->getOperand(OpIdx);
return isConstant(Op) && !isa<UndefValue>(Op);
});
const bool IsUniform = all_of(VL, [&](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
- return I->getOperand(OpIdx) == Op0;
+ return cast<Instruction>(V)->getOperand(OpIdx) == Op0;
});
const bool IsPowerOfTwo = all_of(VL, [&](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- auto *Op = I->getOperand(OpIdx);
+ auto *Op = cast<Instruction>(V)->getOperand(OpIdx);
if (auto *CI = dyn_cast<ConstantInt>(Op))
return CI->getValue().isPowerOf2();
return false;
});
const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) {
// TODO: We should allow undef elements here
- const auto *I = dyn_cast<Instruction>(V);
- if (!I) {
- assert((isa<UndefValue>(V) ||
- I0->getOpcode() == Instruction::GetElementPtr) &&
- "Expected undef or GEP.");
- return true;
- }
- const auto *Op = I->getOperand(OpIdx);
+ auto *Op = cast<Instruction>(V)->getOperand(OpIdx);
if (auto *CI = dyn_cast<ConstantInt>(Op))
return CI->getValue().isNegatedPowerOf2();
return false;
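
In the hunk above, the removed lines tolerated non-instruction values in VL via dyn_cast, while the restored lines assume every lane is an Instruction and use a plain cast. As a reminder of the semantics involved (a small sketch using the real llvm::cast/llvm::dyn_cast helpers, not code from the patch):

    #include "llvm/IR/Instruction.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    const Value *operandOrNull(const Value *V, unsigned OpIdx) {
      // dyn_cast<> yields nullptr when V is not an Instruction (e.g. a
      // constant), so mixed bundles can be skipped gracefully...
      if (const auto *I = dyn_cast<Instruction>(V))
        return I->getOperand(OpIdx);
      // ...whereas cast<Instruction>(V) asserts in +Asserts builds and is
      // undefined behavior in release builds for a non-Instruction value.
      return nullptr;
    }
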
@@ -6323,94 +6302,67 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
- const unsigned Sz = VL.size();
- auto GetCostDiff =
- [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
- function_ref<InstructionCost(InstructionCost)> VectorCost) {
- // Calculate the cost of this instruction.
- InstructionCost ScalarCost = 0;
- if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
- // For some of the instructions no need to calculate cost for each
- // particular instruction, we can use the cost of the single
- // instruction x total number of scalar instructions.
- ScalarCost = Sz * ScalarEltCost(0);
- } else {
- for (unsigned I = 0; I < Sz; ++I)
- ScalarCost += ScalarEltCost(I);
- }
-
- InstructionCost VecCost = VectorCost(CommonCost);
- LLVM_DEBUG(
- dumpTreeCosts(E, CommonCost, VecCost - CommonCost, ScalarCost));
- // Suppress warnings that `this` and `E` are unused. Required for
- // `dumpTreeCosts`.
- (void)this;
- (void)E;
- return VecCost - ScalarCost;
- };
switch (ShuffleOrOp) {
- case Instruction::PHI: {
- // Count reused scalars.
- InstructionCost ScalarCost = 0;
- SmallPtrSet<const TreeEntry *, 4> CountedOps;
- for (Value *V : VL) {
- auto *PHI = dyn_cast<PHINode>(V);
- if (!PHI)
- continue;
-
- ValueList Operands(PHI->getNumIncomingValues(), nullptr);
- for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
- Value *Op = PHI->getIncomingValue(I);
- Operands[I] = Op;
- }
- if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
- if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
- if (!OpTE->ReuseShuffleIndices.empty())
- ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
- OpTE->Scalars.size());
- }
+ case Instruction::PHI:
+ return 0;
- return CommonCost - ScalarCost;
- }
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *I = cast<Instruction>(VL[Idx]);
- VectorType *SrcVecTy;
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *EE = cast<ExtractElementInst>(I);
- SrcVecTy = EE->getVectorOperandType();
- } else {
- auto *EV = cast<ExtractValueInst>(I);
- Type *AggregateTy = EV->getAggregateOperand()->getType();
- unsigned NumElts;
- if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
- NumElts = ATy->getNumElements();
- else
- NumElts = AggregateTy->getStructNumElements();
- SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
- }
- if (I->hasOneUse()) {
- Instruction *Ext = I->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- InstructionCost Cost = TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
- // Subtract the cost of s|zext which is subtracted separately.
- Cost -= TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), I->getType(),
- TTI::getCastContextHint(Ext), CostKind, Ext);
- return Cost;
+ // The common cost of removing ExtractElement/ExtractValue instructions +
+ // the cost of shuffles, if required to reshuffle the original vector.
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *EE = cast<ExtractElementInst>(VL[I]);
+ CommonCost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(),
+ *getExtractIndex(EE));
+ } else {
+ CommonCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
}
}
- return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
- *getExtractIndex(I));
- };
- auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ Idx = EntryVF;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *EE = cast<ExtractElementInst>(V);
+ CommonCost += TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(),
+ *getExtractIndex(EE));
+ } else {
+ --Idx;
+ CommonCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
+ }
+ if (ShuffleOrOp == Instruction::ExtractValue) {
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<Instruction>(VL[I]);
+ // Take credit for the instruction that will become dead.
+ if (EI->hasOneUse()) {
+ Instruction *Ext = EI->user_back();
+ if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+ return isa<GetElementPtrInst>(U);
+ })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ CommonCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, I);
+ // Add back the cost of s|zext which is subtracted separately.
+ CommonCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), EI->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ continue;
+ }
+ }
+ CommonCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
+ }
+ } else {
+ AdjustExtractsCost(CommonCost);
+ }
+ return CommonCost;
}
case Instruction::InsertElement: {
assert(E->ReuseShuffleIndices.empty() &&
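
Both sides of the extract hunk above share one credit: an extract whose sole user is a sext/zext feeding only GEPs is priced as a combined extract+extend, with the extend's standalone cost stripped back out because it is accounted for elsewhere. A self-contained sketch of that shared logic (simplified names, not an excerpt):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    InstructionCost extractPairCost(TargetTransformInfo &TTI, Instruction *EI,
                                    VectorType *SrcVecTy, unsigned Idx,
                                    TargetTransformInfo::TargetCostKind CostKind) {
      if (EI->hasOneUse()) {
        Instruction *Ext = EI->user_back();
        if (isa<SExtInst, ZExtInst>(Ext) &&
            all_of(Ext->users(),
                   [](User *U) { return isa<GetElementPtrInst>(U); })) {
          // Price extract+extend together, then remove the extend's own
          // cost, which the caller subtracts/adds separately.
          return TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
                                              SrcVecTy, Idx) -
                 TTI.getCastInstrCost(Ext->getOpcode(), Ext->getType(),
                                      EI->getType(),
                                      TargetTransformInfo::getCastContextHint(Ext),
                                      CostKind, Ext);
        }
      }
      return TTI.getVectorInstrCost(Instruction::ExtractElement, SrcVecTy, Idx);
    }
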
@@ -6528,72 +6480,78 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
- return TTI->getCastInstrCost(E->getOpcode(), ScalarTy,
- VI->getOperand(0)->getType(),
- TTI::getCastContextHint(VI), CostKind, VI);
- };
- auto GetVectorCost = [=](InstructionCost CommonCost) {
- Type *SrcTy = VL0->getOperand(0)->getType();
- auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- InstructionCost VecCost = CommonCost;
- // Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
- VecCost +=
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
- return VecCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ InstructionCost ScalarEltCost =
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ }
+
+ // Calculate the cost of this instruction.
+ InstructionCost ScalarCost = VL.size() * ScalarEltCost;
+
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+ InstructionCost VecCost = 0;
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ VecCost = CommonCost + TTI->getCastInstrCost(
+ E->getOpcode(), VecTy, SrcVecTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
- CmpInst::Predicate VecPred, SwappedVecPred;
- auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
- if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
- match(VL0, MatchCmp))
- SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
- else
- SwappedVecPred = VecPred = CmpInst::BAD_ICMP_PREDICATE;
- auto GetScalarCost = [&](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
+ // Calculate the cost of this instruction.
+ InstructionCost ScalarEltCost =
+ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ }
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+
+ // Check if all entries in VL are either compares or selects with compares
+ // as condition that have the same predicates.
+ CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ bool First = true;
+ for (auto *V : VL) {
CmpInst::Predicate CurrentPred;
auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
- if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
- !match(VI, MatchCmp)) ||
- (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
- VecPred = SwappedVecPred = CmpInst::BAD_ICMP_PREDICATE;
-
- return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(), CurrentPred, CostKind,
- VI);
- };
- auto GetVectorCost = [&](InstructionCost CommonCost) {
- auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
+ !match(V, MatchCmp)) ||
+ (!First && VecPred != CurrentPred)) {
+ VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ break;
+ }
+ First = false;
+ VecPred = CurrentPred;
+ }
- InstructionCost VecCost = TTI->getCmpSelInstrCost(
- E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
- // Check if it is possible and profitable to use min/max for selects
- // in VL.
- //
- auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
- if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
- IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
- {VecTy, VecTy});
- InstructionCost IntrinsicCost =
- TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- // If the selects are the only uses of the compares, they will be
- // dead and we can adjust the cost by removing their cost.
- if (IntrinsicAndUse.second)
- IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
- MaskTy, VecPred, CostKind);
- VecCost = std::min(VecCost, IntrinsicCost);
- }
- return VecCost + CommonCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ InstructionCost VecCost = TTI->getCmpSelInstrCost(
+ E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
+ // Check if it is possible and profitable to use min/max for selects in
+ // VL.
+ //
+ auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
+ if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
+ IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
+ {VecTy, VecTy});
+ InstructionCost IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ // If the selects are the only uses of the compares, they will be dead
+ // and we can adjust the cost by removing their cost.
+ if (IntrinsicAndUse.second)
+ IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
+ MaskTy, VecPred, CostKind);
+ VecCost = std::min(VecCost, IntrinsicCost);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
case Instruction::Add:
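
Both versions of the cmp/select case above keep the same min/max heuristic: when the bundle can fold to a min/max intrinsic, the generic cmp+select vector cost is clamped to the cheaper of the two forms, and the compares' cost is refunded if the selects are their only users. A compilable sketch of that clamp (MinMaxID/SelectsOnly mirror the pair returned by canConvertToMinOrMaxIntrinsic in the code above; simplified, not an excerpt):

    #include <algorithm>
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    InstructionCost clampToMinMax(TargetTransformInfo &TTI,
                                  InstructionCost VecCost, FixedVectorType *VecTy,
                                  VectorType *MaskTy, CmpInst::Predicate VecPred,
                                  Intrinsic::ID MinMaxID, bool SelectsOnly,
                                  TargetTransformInfo::TargetCostKind CostKind) {
      if (MinMaxID == Intrinsic::not_intrinsic)
        return VecCost;
      IntrinsicCostAttributes CostAttrs(MinMaxID, VecTy, {VecTy, VecTy});
      InstructionCost IntrinsicCost =
          TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
      // If the selects are the compares' only users, the compares die too.
      if (SelectsOnly)
        IntrinsicCost -= TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy,
                                                MaskTy, VecPred, CostKind);
      return std::min(VecCost, IntrinsicCost);
    }
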
@@ -6613,116 +6571,142 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor:
- case Instruction::GetElementPtr: {
- unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr
- ? static_cast<unsigned>(Instruction::Add)
- : ShuffleOrOp;
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = dyn_cast<Instruction>(VL[Idx]);
- // GEPs may contain just addresses without instructions, consider
- // their cost 0.
- if (!VI)
- return InstructionCost();
- unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
+ case Instruction::Xor: {
+ const unsigned OpIdx = isa<BinaryOperator>(VL0) ? 1 : 0;
+
+ InstructionCost ScalarCost = 0;
+ for (auto *V : VL) {
+ auto *VI = cast<Instruction>(V);
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
TTI::OperandValueInfo Op2Info =
TTI::getOperandInfo(VI->getOperand(OpIdx));
- SmallVector<const Value *> Operands(VI->operand_values());
- return TTI->getArithmeticInstrCost(Opcode, ScalarTy, CostKind, Op1Info,
- Op2Info, Operands, VI);
- };
- auto GetVectorCost = [=](InstructionCost CommonCost) {
- unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
- TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
- TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
- return TTI->getArithmeticInstrCost(Opcode, VecTy, CostKind, Op1Info,
- Op2Info) +
- CommonCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ SmallVector<const Value *, 4> Operands(VI->operand_values());
+ ScalarCost += TTI->getArithmeticInstrCost(
+ E->getOpcode(), ScalarTy, CostKind, Op1Info, Op2Info, Operands, VI);
+ }
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarCost / VL.size();
+ }
+ TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
+ TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
+ InstructionCost VecCost = TTI->getArithmeticInstrCost(
+ E->getOpcode(), VecTy, CostKind, Op1Info, Op2Info);
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ any_of(VL,
+ [](Value *V) {
+ return isa<GetElementPtrInst>(V) &&
+ !isConstant(cast<GetElementPtrInst>(V)->getOperand(1));
+ })
+ ? TargetTransformInfo::OK_AnyValue
+ : TargetTransformInfo::OK_UniformConstantValue;
+
+ InstructionCost ScalarEltCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
+ {Op1VK, TargetTransformInfo::OP_None},
+ {Op2VK, TargetTransformInfo::OP_None});
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ }
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
+ {Op1VK, TargetTransformInfo::OP_None},
+ {Op2VK, TargetTransformInfo::OP_None});
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
case Instruction::Load: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<LoadInst>(VL[Idx]);
- InstructionCost GEPCost = 0;
- if (VI != VL0) {
+ // Cost of wide load - cost of scalar loads.
+ Align Alignment = cast<LoadInst>(VL0)->getAlign();
+ InstructionCost ScalarEltCost =
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0,
+ CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0);
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ }
+ InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecLdCost;
+ if (E->State == TreeEntry::Vectorize) {
+ VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
+ CostKind, TTI::OperandValueInfo(), VL0);
+ for (Value *V : VL) {
+ auto *VI = cast<LoadInst>(V);
+ // Add the costs of scalar GEP pointers, to be removed from the code.
+ if (VI == VL0)
+ continue;
auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
- if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices())
- GEPCost = TTI->getArithmeticInstrCost(Instruction::Add,
- Ptr->getType(), CostKind);
- }
- return GEPCost +
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
- VI->getPointerAddressSpace(), CostKind,
- TTI::OperandValueInfo(), VI);
- };
- auto GetVectorCost = [=](InstructionCost CommonCost) {
- auto *LI0 = cast<LoadInst>(VL0);
- InstructionCost VecLdCost;
- if (E->State == TreeEntry::Vectorize) {
- VecLdCost = TTI->getMemoryOpCost(
- Instruction::Load, VecTy, LI0->getAlign(),
- LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
- } else {
- assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
- Align CommonAlignment = LI0->getAlign();
- for (Value *V : VL)
- CommonAlignment =
- std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
- VecLdCost = TTI->getGatherScatterOpCost(
- Instruction::Load, VecTy, LI0->getPointerOperand(),
- /*VariableMask=*/false, CommonAlignment, CostKind);
+ if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
+ continue;
+ ScalarLdCost += TTI->getArithmeticInstrCost(Instruction::Add,
+ Ptr->getType(), CostKind);
}
- return VecLdCost + CommonCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ Align CommonAlignment = Alignment;
+ for (Value *V : VL)
+ CommonAlignment =
+ std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ VecLdCost = TTI->getGatherScatterOpCost(
+ Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+ return CommonCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
bool IsReorder = !E->ReorderIndices.empty();
auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<StoreInst>(VL[Idx]);
- InstructionCost GEPCost = 0;
- if (VI != SI) {
- auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
- if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices())
- GEPCost = TTI->getArithmeticInstrCost(Instruction::Add,
- Ptr->getType(), CostKind);
- }
- TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0);
- return GEPCost + TTI->getMemoryOpCost(
- Instruction::Store, ScalarTy, VI->getAlign(),
- VI->getPointerAddressSpace(), CostKind, OpInfo, VI);
- };
- auto GetVectorCost = [=](InstructionCost CommonCost) {
- // We know that we can merge the stores. Calculate the cost.
- TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
- return TTI->getMemoryOpCost(Instruction::Store, VecTy, SI->getAlign(),
- SI->getPointerAddressSpace(), CostKind,
- OpInfo) +
- CommonCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ Align Alignment = SI->getAlign();
+ InstructionCost ScalarStCost = 0;
+ for (auto *V : VL) {
+ auto *VI = cast<StoreInst>(V);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getOperand(0));
+ ScalarStCost += TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+ Alignment, 0, CostKind, OpInfo, VI);
+ // Add the costs of scalar GEP pointers, to be removed from the code.
+ if (VI == SI)
+ continue;
+ auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
+ if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
+ continue;
+ ScalarStCost += TTI->getArithmeticInstrCost(Instruction::Add,
+ Ptr->getType(), CostKind);
+ }
+ TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
+ InstructionCost VecStCost = TTI->getMemoryOpCost(
+ Instruction::Store, VecTy, Alignment, 0, CostKind, OpInfo);
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+ return CommonCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
- auto GetScalarCost = [=](unsigned Idx) {
- auto *CI = cast<CallInst>(VL[Idx]);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID != Intrinsic::not_intrinsic) {
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
- return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- }
- return TTI->getCallInstrCost(CI->getCalledFunction(),
- CI->getFunctionType()->getReturnType(),
- CI->getFunctionType()->params(), CostKind);
- };
- auto GetVectorCost = [=](InstructionCost CommonCost) {
- auto *CI = cast<CallInst>(VL0);
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+ InstructionCost ScalarEltCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ if (NeedToShuffleReuses) {
+ CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+ }
+ InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ InstructionCost VecCallCost =
+ std::min(VecCallCosts.first, VecCallCosts.second);
+
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return CommonCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
assert(E->isAltShuffle() &&
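
One detail worth noting from the load case above: on the ScatterVectorize path, both versions cost a masked gather at the least alignment of any member load. A self-contained sketch of that computation (simplified names, not an excerpt):

    #include <algorithm>
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    InstructionCost gatherLoadCost(TargetTransformInfo &TTI, ArrayRef<Value *> VL,
                                   FixedVectorType *VecTy,
                                   TargetTransformInfo::TargetCostKind CostKind) {
      auto *LI0 = cast<LoadInst>(VL.front());
      // A gather must honor the least-aligned member load.
      Align CommonAlignment = LI0->getAlign();
      for (Value *V : VL)
        CommonAlignment =
            std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
      return TTI.getGatherScatterOpCost(Instruction::Load, VecTy,
                                        LI0->getPointerOperand(),
                                        /*VariableMask=*/false, CommonAlignment,
                                        CostKind);
    }
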
@@ -6732,6 +6716,25 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
Instruction::isCast(E->getAltOpcode())) ||
(isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
"Invalid Shuffle Vector Operand");
+ InstructionCost ScalarCost = 0;
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ CommonCost -= TTI->getInstructionCost(I, CostKind);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ CommonCost += TTI->getInstructionCost(I, CostKind);
+ }
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
+ }
+ // VecCost is equal to the sum of the cost of creating 2 vectors
+ // and the cost of creating the shuffle.
+ InstructionCost VecCost = 0;
// Try to find the previous shuffle node with the same operands and same
// main/alternate ops.
auto TryFindNodeWithEqualOperands = [=]() {
@@ -6748,66 +6751,53 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
}
return false;
};
- auto GetScalarCost = [=](unsigned Idx) {
- auto *VI = cast<Instruction>(VL[Idx]);
- assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
- (void)E;
- return TTI->getInstructionCost(VI, CostKind);
- };
- // Need to clear CommonCost since the final shuffle cost is included into
- // vector cost.
- auto GetVectorCost = [&](InstructionCost) {
- // VecCost is equal to the sum of the cost of creating 2 vectors
- // and the cost of creating the shuffle.
- InstructionCost VecCost = 0;
- if (TryFindNodeWithEqualOperands()) {
- LLVM_DEBUG({
- dbgs() << "SLP: diamond match for alternate node found.\n";
- E->dump();
- });
- // No need to add new vector costs here since we're going to reuse
- // same main/alternate vector ops, just do different shuffling.
- } else if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
- VecCost +=
- TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
- } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
- VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
- CI0->getPredicate(), CostKind, VL0);
- VecCost += TTI->getCmpSelInstrCost(
- E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
- cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
- E->getAltOp());
- } else {
- Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
- Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
- auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
- TTI::CastContextHint::None, CostKind);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
- TTI::CastContextHint::None, CostKind);
- }
+ if (TryFindNodeWithEqualOperands()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: diamond match for alternate node found.\n";
+ E->dump();
+ });
+ // No need to add new vector costs here since we're going to reuse
+ // same main/alternate vector ops, just do different shuffling.
+ } else if (Instruction::isBinaryOp(E->getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost +=
+ TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+ } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+ VecCost =
+ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ CI0->getPredicate(), CostKind, VL0);
+ VecCost +=
+ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ cast<CmpInst>(E->getAltOp())->getPredicate(),
+ CostKind, E->getAltOp());
+ } else {
+ Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+ Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ TTI::CastContextHint::None, CostKind);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ TTI::CastContextHint::None, CostKind);
+ }
+
+ if (E->ReuseShuffleIndices.empty()) {
+ CommonCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+ } else {
SmallVector<int> Mask;
- if (E->ReuseShuffleIndices.empty()) {
- VecCost +=
- TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
- } else {
- SmallVector<int> Mask;
- buildShuffleEntryMask(
- E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
- [E](Instruction *I) {
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- return I->getOpcode() == E->getAltOpcode();
- },
- Mask);
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+ buildShuffleEntryMask(
+ E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+ [E](Instruction *I) {
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ return I->getOpcode() == E->getAltOpcode();
+ },
+ Mask);
+ CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
FinalVecTy, Mask);
- }
- return VecCost;
- };
- return GetCostDiff(GetScalarCost, GetVectorCost);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
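
The ShuffleVector case above ultimately prices a two-source permute whose mask routes each lane to either the main-opcode vector or the alternate-opcode vector. A simplified, self-contained version of such a mask construction (buildShuffleEntryMask additionally handles reorder and reuse indices, which this sketch omits):

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"

    // Lane I reads from the first source at index I, or from the second
    // source at index VF + I when the lane uses the alternate opcode.
    llvm::SmallVector<int> buildAltMask(unsigned VF,
                                        llvm::function_ref<bool(unsigned)> IsAlt) {
      llvm::SmallVector<int> Mask(VF);
      for (unsigned I = 0; I < VF; ++I)
        Mask[I] = IsAlt(I) ? static_cast<int>(VF + I) : static_cast<int>(I);
      return Mask;
    }
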
@@ -7299,6 +7289,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
+ // Already counted the cost for external uses when tried to adjust the cost
+ // for extractelements, no need to add it again.
+ if (isa<ExtractElementInst>(EU.Scalar))
+ continue;
+
// If found user is an insertelement, do not calculate extract cost but try
// to detect it as a final shuffled/identity match.
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
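
The getTreeCost hunk above re-adds an early continue so an extractelement scalar, whose removal was already credited in the extract case's CommonCost, is not billed a second time as an external use. The shape of the restored guard, as a sketch (ExternalUser here is a simplified stand-in for SLP's external-use record):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    struct ExternalUser { Value *Scalar; }; // simplified stand-in

    static bool skipExternalUseCost(const ExternalUser &EU) {
      // Vector-typed scalars need no extract; extractelements were already
      // costed when the extract bundle itself was priced.
      return isa<FixedVectorType>(EU.Scalar->getType()) ||
             isa<ExtractElementInst>(EU.Scalar);
    }
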
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 3f295f5aff90e..9fde7e563c7be 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -slp-threshold=-3 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -slp-threshold=-5 -S -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
index b1adeec0f636e..3148b054d3082 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
@@ -107,14 +107,25 @@ entry:
define void @select_ule_ugt_mix_4xi32(i32* %ptr, i32 %x) {
; CHECK-LABEL: @select_ule_ugt_mix_4xi32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[TMP1]], <i32 16383, i32 16383, i32 16383, i32 16383>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i32> [[TMP1]], <i32 16383, i32 16383, i32 16383, i32 16383>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 16383, i32 16383, i32 16383, i32 16383>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ult i32 [[L_0]], 16383
+; CHECK-NEXT: [[S_0:%.*]] = select i1 [[CMP_0]], i32 [[L_0]], i32 16383
+; CHECK-NEXT: store i32 [[S_0]], i32* [[PTR]], align 4
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 1
+; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp ugt i32 [[L_1]], 16383
+; CHECK-NEXT: [[S_1:%.*]] = select i1 [[CMP_1]], i32 [[L_1]], i32 16383
+; CHECK-NEXT: store i32 [[S_1]], i32* [[GEP_1]], align 4
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 2
+; CHECK-NEXT: [[L_2:%.*]] = load i32, i32* [[GEP_2]], align 4
+; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i32 [[L_2]], 16383
+; CHECK-NEXT: [[S_2:%.*]] = select i1 [[CMP_2]], i32 [[L_2]], i32 16383
+; CHECK-NEXT: store i32 [[S_2]], i32* [[GEP_2]], align 4
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 3
+; CHECK-NEXT: [[L_3:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT: [[CMP_3:%.*]] = icmp ugt i32 [[L_3]], 16383
+; CHECK-NEXT: [[S_3:%.*]] = select i1 [[CMP_3]], i32 [[L_3]], i32 16383
+; CHECK-NEXT: store i32 [[S_3]], i32* [[GEP_3]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
index dbfd2b23d5376..ba7b35b3bb9a3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -53,17 +53,13 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
; CHECK-LABEL: @i64_simplifiedi_extract(
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8
-; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
-; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8
-; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX3]], align 8
-; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
-; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
-; CHECK-NEXT: store i64 [[T1]], i64* [[LD]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD:%.*]] to <2 x i64>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST:%.*]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8
; CHECK-NEXT: ret void
;
%arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
index 94aa4449c6797..d6caca29016f6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-same-vals.ll
@@ -8,22 +8,13 @@ define i64 @test() {
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
-; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ 0, [[BB2:%.*]] ], [ 0, [[BB1:%.*]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[BB2]] ], [ 0, [[BB1]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP4]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-; CHECK-NEXT: [[TMP44:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP44]], i32 [[TMP4]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP4]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP8]], [[TMP4]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[TMP4]], [[TMP4]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP]]
-; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX3]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ poison, [[BB2:%.*]] ], [ zeroinitializer, [[BB1:%.*]] ]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[SHUFFLE]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP65:%.*]] = sext i32 [[OP_RDX]] to i64
; CHECK-NEXT: ret i64 [[TMP65]]
;
bb1:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
index c855583b273ba..eeeaf660ae9a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
@@ -6,24 +6,14 @@
define i16 @D134605() {
; CHECK-LABEL: @D134605(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr poison, align 1
-; CHECK-NEXT: [[ARRAYIDX101:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 1
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX101]], align 1
-; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 2
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX107]], align 1
-; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]]
-; CHECK-NEXT: [[ADD116:%.*]] = add i16 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[ADD122:%.*]] = add i16 [[ADD116]], [[TMP2]]
-; CHECK-NEXT: [[ADD124:%.*]] = add i16 [[ADD122]], [[TMP3]]
-; CHECK-NEXT: [[ADD125:%.*]] = add i16 [[ADD124]], poison
-; CHECK-NEXT: [[FACTOR2531:%.*]] = add i16 [[TMP3]], [[ADD125]]
-; CHECK-NEXT: [[ADD14332:%.*]] = add i16 [[FACTOR2531]], [[TMP2]]
-; CHECK-NEXT: [[ADD14933:%.*]] = add i16 [[ADD14332]], [[TMP1]]
-; CHECK-NEXT: [[ADD15534:%.*]] = add i16 [[ADD14933]], [[TMP0]]
-; CHECK-NEXT: [[ADD15935:%.*]] = add i16 [[ADD15534]], poison
-; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[ADD15935]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <8 x i32> <i32 2, i32 2, i32 1, i32 1, i32 0, i32 0, i32 3, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i16> [[SHUFFLE]], i32 6
+; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[SHUFFLE]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP2]], poison
+; CHECK-NEXT: [[OP_RDX1:%.*]] = add i16 [[OP_RDX]], poison
+; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX1]], 2
; CHECK-NEXT: [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120)
; CHECK-NEXT: unreachable
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index d4b352134d55f..6dd5dc1b35af4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -4,32 +4,20 @@
define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[A1:%.*]]
-; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP0]], [[A2:%.*]]
-; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP0]], [[A3:%.*]]
-; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[TMP0]], [[A4:%.*]]
-; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[TMP0]], [[A5:%.*]]
-; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[TMP0]], [[A6:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARR]], align 4
-; CHECK-NEXT: [[ADD12:%.*]] = add i32 [[TMP1]], [[A7:%.*]]
-; CHECK-NEXT: [[ADD14:%.*]] = add i32 [[TMP1]], [[A8:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], [[ADD2]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[ADD]], i32 [[ADD2]]
-; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], [[ADD4]]
-; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 [[ADD4]]
-; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], [[ADD6]]
-; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 [[ADD6]]
-; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], [[ADD8]]
-; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 [[ADD8]]
-; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], [[ADD10]]
-; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 [[ADD10]]
-; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], [[ADD12]]
-; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 [[ADD12]]
-; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], [[ADD14]]
-; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 [[ADD14]]
-; CHECK-NEXT: ret i32 [[COND44]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR:%.*]] to <2 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A2:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
+; CHECK-NEXT: ret i32 [[TMP11]]
;
entry:
%arrayidx = getelementptr inbounds i32, i32* %arr, i64 1