[llvm] 9202806 - Revert "[CostModel] Remove VF from IntrinsicCostAttributes"
Jinsong Ji via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 8 18:15:27 PST 2021
Author: Jinsong Ji
Date: 2021-02-09T02:14:14Z
New Revision: 92028062413907e1c10b16e7c338e0745a11a051
URL: https://github.com/llvm/llvm-project/commit/92028062413907e1c10b16e7c338e0745a11a051
DIFF: https://github.com/llvm/llvm-project/commit/92028062413907e1c10b16e7c338e0745a11a051.diff
LOG: Revert "[CostModel] Remove VF from IntrinsicCostAttributes"
This reverts commit 502a67dd7f23901834e05071ab253889f671b5d9.
This exposes a failure in the test-suite build on PowerPC;
reverting to unblock the buildbot first.
Dave will re-commit in https://reviews.llvm.org/D96287.
Thanks Dave.
Added:
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/CodeGen/BasicTTIImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 04106eca81b1..992e15500d9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -118,32 +118,44 @@ class IntrinsicCostAttributes {
SmallVector<Type *, 4> ParamTys;
SmallVector<const Value *, 4> Arguments;
FastMathFlags FMF;
+ ElementCount VF = ElementCount::getFixed(1);
// If ScalarizationCost is UINT_MAX, the cost of scalarizing the
// arguments and the return value will be computed based on types.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
public:
- IntrinsicCostAttributes(
- Intrinsic::ID Id, const CallBase &CI,
- unsigned ScalarizationCost = std::numeric_limits<unsigned>::max());
+ IntrinsicCostAttributes(const IntrinsicInst &I);
- IntrinsicCostAttributes(
- Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys,
- FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr,
- unsigned ScalarCost = std::numeric_limits<unsigned>::max());
+ IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI);
+
+ IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
+ ElementCount Factor);
+
+ IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI,
+ ElementCount Factor, unsigned ScalarCost);
IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
- ArrayRef<const Value *> Args);
+ ArrayRef<Type *> Tys, FastMathFlags Flags);
- IntrinsicCostAttributes(
- Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args,
- ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(),
- const IntrinsicInst *I = nullptr,
- unsigned ScalarCost = std::numeric_limits<unsigned>::max());
+ IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys, FastMathFlags Flags,
+ unsigned ScalarCost);
+
+ IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys, FastMathFlags Flags,
+ unsigned ScalarCost,
+ const IntrinsicInst *I);
+
+ IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys);
+
+ IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<const Value *> Args);
Intrinsic::ID getID() const { return IID; }
const IntrinsicInst *getInst() const { return II; }
Type *getReturnType() const { return RetTy; }
+ ElementCount getVectorFactor() const { return VF; }
FastMathFlags getFlags() const { return FMF; }
unsigned getScalarizationCost() const { return ScalarizationCost; }
const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; }
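As an aside, the VF-carrying (Id, CallBase, ElementCount) constructor restored
above lets a vectorizer ask for the cost of widening a scalar intrinsic call
without materializing vector types itself. A minimal sketch, not from this
commit; costAtVF is an illustrative name, and CI is assumed to call a
vectorizable intrinsic such as llvm.sadd.sat.i16:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Query the cost of executing CI at vectorization factor Width via the
    // restored (ID, CallBase, ElementCount) constructor.
    InstructionCost costAtVF(const TargetTransformInfo &TTI,
                             const IntrinsicInst &CI, unsigned Width) {
      IntrinsicCostAttributes Attrs(CI.getIntrinsicID(), CI,
                                    ElementCount::getFixed(Width));
      return TTI.getIntrinsicInstrCost(
          Attrs, TargetTransformInfo::TCK_RecipThroughput);
    }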
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d5ab2fb6d4f2..75e897c2db91 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1211,9 +1211,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
Type *RetTy = ICA.getReturnType();
+ ElementCount VF = ICA.getVectorFactor();
ElementCount RetVF =
(RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
: ElementCount::getFixed(1));
+ assert((RetVF.isScalar() || VF.isScalar()) &&
+ "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
@@ -1223,13 +1226,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::cttz:
// FIXME: If necessary, this should go in target-specific overrides.
- if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
+ if (VF.isScalar() && RetVF.isScalar() &&
+ getTLI()->isCheapToSpeculateCttz())
return TargetTransformInfo::TCC_Basic;
break;
case Intrinsic::ctlz:
// FIXME: If necessary, this should go in target-specific overrides.
- if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
+ if (VF.isScalar() && RetVF.isScalar() &&
+ getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;
break;
@@ -1237,14 +1242,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return thisT()->getMemcpyCost(ICA.getInst());
case Intrinsic::masked_scatter: {
+ assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[3];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
return thisT()->getGatherScatterOpCost(Instruction::Store,
- ICA.getArgTypes()[0], Args[1],
+ Args[0]->getType(), Args[1],
VarMask, Alignment, CostKind, I);
}
case Intrinsic::masked_gather: {
+ assert(VF.isScalar() && "Can't vectorize types here.");
const Value *Mask = Args[2];
bool VarMask = !isa<Constant>(Mask);
Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
@@ -1282,13 +1289,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::vector_reduce_fmin:
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
- IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
+ IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::vector_reduce_fadd:
case Intrinsic::vector_reduce_fmul: {
IntrinsicCostAttributes Attrs(
- IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
+ IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
case Intrinsic::fshl:
@@ -1340,20 +1347,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
// Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types;
+ for (const Value *Op : Args) {
+ Type *OpTy = Op->getType();
+ assert(VF.isScalar() || !OpTy->isVectorTy());
+ Types.push_back(VF.isScalar()
+ ? OpTy
+ : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
+ }
+
+ if (VF.isVector() && !RetTy->isVoidTy())
+ RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
+
// Compute the scalarization overhead based on Args for a vector
- // intrinsic.
+ // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+ // CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
- if (RetVF.isVector()) {
+ if (RetVF.isVector() || VF.isVector()) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost +=
- getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+ getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
}
- IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
- ScalarizationCost);
+ IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
}
@@ -1596,7 +1615,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// SatMin -> Overflow && SumDiff >= 0
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
- nullptr, ScalarizationCostPassed);
+ ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
@@ -1617,7 +1636,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned Cost = 0;
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
- nullptr, ScalarizationCostPassed);
+ ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
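The comment restored above in getIntrinsicInstrCost ("A vectorizer will pass a
scalar RetTy and VF > 1, while CostModel will pass a vector RetTy and VF is 1")
is the invariant the reinstated assert enforces: the two conventions are
mutually exclusive. A hedged sketch of both queries for the same <8 x i16>
operation, assuming CI is a scalar call to llvm.sadd.sat.i16 (buildBothQueries
is an illustrative name, not LLVM API):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/Intrinsics.h"
    using namespace llvm;

    void buildBothQueries(const CallBase &CI) {
      // Vectorizer style: the scalar call plus an explicit VF.
      IntrinsicCostAttributes FromVectorizer(Intrinsic::sadd_sat, CI,
                                             ElementCount::getFixed(8));
      // Cost-model style: already-widened types, VF left at 1.
      Type *VecTy = FixedVectorType::get(CI.getType(), 8);
      IntrinsicCostAttributes FromCostModel(Intrinsic::sadd_sat, VecTy,
                                            {VecTy, VecTy});
      (void)FromVectorizer;
      (void)FromCostModel;
    }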
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5fde2b471da8..e376d42c15bd 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -54,26 +54,86 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
return true;
}
+IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
+ II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
+
+ FunctionType *FTy = I.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+ Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+ FMF = FPMO->getFastMathFlags();
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+ const CallBase &CI) :
+ II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
+
+ if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+ FMF = FPMO->getFastMathFlags();
+
+ Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
const CallBase &CI,
- unsigned ScalarizationCost)
- : II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
- ScalarizationCost(ScalarizationCost) {
+ ElementCount Factor)
+ : RetTy(CI.getType()), IID(Id), VF(Factor) {
+
+ assert(!Factor.isScalable() && "Scalable vectors are not yet supported");
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+ FMF = FPMO->getFastMathFlags();
+
+ Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+ const CallBase &CI,
+ ElementCount Factor,
+ unsigned ScalarCost)
+ : RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
- FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags Flags) :
+ RetTy(RTy), IID(Id), FMF(Flags) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
ArrayRef<Type *> Tys,
FastMathFlags Flags,
- const IntrinsicInst *I,
- unsigned ScalarCost)
- : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+ unsigned ScalarCost) :
+ RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags Flags,
+ unsigned ScalarCost,
+ const IntrinsicInst *I) :
+ II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys) :
+ RetTy(RTy), IID(Id) {
ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
}
@@ -87,17 +147,6 @@ IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
ParamTys.push_back(Arguments[Idx]->getType());
}
-IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
- ArrayRef<const Value *> Args,
- ArrayRef<Type *> Tys,
- FastMathFlags Flags,
- const IntrinsicInst *I,
- unsigned ScalarCost)
- : II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
- ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
- Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
-}
-
bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
LoopInfo &LI, DominatorTree &DT,
bool ForceNestedLoop,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 7c33f22d29ef..7b8a79640bb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -731,28 +731,40 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+ Type *RetTy = ICA.getReturnType();
+ unsigned VF = ICA.getVectorFactor().getFixedValue();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
+ assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types;
+ for (const Value *Op : Args) {
+ Type *OpTy = Op->getType();
+ assert(VF == 1 || !OpTy->isVectorTy());
+ Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+ }
+
+ if (VF > 1 && !RetTy->isVoidTy())
+ RetTy = FixedVectorType::get(RetTy, VF);
// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
- if (RetVF > 1) {
+ if (RetVF > 1 || VF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
- ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+ ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}
- IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
- ScalarizationCost);
+ IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+ ScalarizationCost, I);
return getIntrinsicInstrCost(Attrs, CostKind);
}
@@ -772,20 +784,9 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost(CostKind);
-
- switch (ICA.getID()) {
- case Intrinsic::fma:
+ if (ICA.getID() == Intrinsic::fma) {
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
- break;
- case Intrinsic::uadd_sat:
- case Intrinsic::usub_sat:
- case Intrinsic::sadd_sat:
- case Intrinsic::ssub_sat:
- static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
- if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
- NElts = 1;
- break;
}
return LT.first * NElts * InstRate;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d674dd9000bf..af67839c2d75 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1550,16 +1550,21 @@ int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
case Intrinsic::usub_sat: {
if (!ST->hasMVEIntegerOps())
break;
+ // Get the Return type, either directly or from ICA.ReturnType and ICA.VF.
Type *VT = ICA.getReturnType();
+ if (!VT->isVectorTy() && !ICA.getVectorFactor().isScalar())
+ VT = VectorType::get(VT, ICA.getVectorFactor());
std::pair<int, MVT> LT =
TLI->getTypeLegalizationCost(DL, VT);
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
LT.second == MVT::v16i8) {
- // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
+ // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
// need to extend the type, as it uses shr(qadd(shl, shl)).
- unsigned Instrs =
- LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
+ unsigned Instrs = LT.second.getScalarSizeInBits() ==
+ ICA.getReturnType()->getScalarSizeInBits()
+ ? 1
+ : 4;
return LT.first * ST->getMVEVectorCostFactor() * Instrs;
}
break;
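A worked reading of the restored formula above: when the legalized lane width
already matches the scalar width of the original return type, only the
saturating add itself is needed; when the lanes were promoted, MVE emulates
saturation as shr(qadd(shl, shl)), i.e. four instructions. A standalone sketch
under those assumptions, with hypothetical names (mveSatAddCost and its
parameters are illustrative, not LLVM API):

    // LegalizationSteps stands in for LT.first, LaneBits for
    // LT.second.getScalarSizeInBits(), ScalarBits for
    // ICA.getReturnType()->getScalarSizeInBits(), and CostFactor for
    // ST->getMVEVectorCostFactor().
    unsigned mveSatAddCost(unsigned LegalizationSteps, unsigned LaneBits,
                           unsigned ScalarBits, unsigned CostFactor) {
      // One vqadd when no widening is needed; shl+shl+qadd+shr otherwise.
      unsigned Instrs = (LaneBits == ScalarBits) ? 1 : 4;
      return LegalizationSteps * CostFactor * Instrs;
    }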
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e799a948da1a..d6223415c154 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3803,27 +3803,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
- auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
- if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
- return Elt;
- return VectorType::get(Elt, VF);
- };
-
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
- FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
- SmallVector<Type *> ParamTys;
- std::transform(FTy->param_begin(), FTy->param_end(), ParamTys.begin(),
- [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
-
- IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast<IntrinsicInst>(CI));
+
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
return TTI.getIntrinsicInstrCost(CostAttrs,
TargetTransformInfo::TCK_RecipThroughput);
}
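The deleted MaybeVectorizeType lambda becomes unnecessary because the restored
(ID, CI, VF) constructor records the scalar call plus the factor, and
BasicTTIImpl widens the operand and return types itself (the loop over Args in
the BasicTTIImpl.h hunk above). A hedged restatement of that internal widening
step (widenForVF is an illustrative name):

    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // Per-operand widening as BasicTTIImpl now performs it when VF > 1;
    // mirrors FixedVectorType::get(OpTy, VF.getKnownMinValue()).
    Type *widenForVF(Type *Elt, ElementCount VF) {
      return VF.isScalar()
                 ? Elt
                 : FixedVectorType::get(Elt, VF.getKnownMinValue());
    }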
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d21151d9a909..0b630197911a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3417,16 +3417,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type *, 4> VecTys;
- for (Use &Arg : CI->args())
- VecTys.push_back(
- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
- FastMathFlags FMF;
- if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
- FMF = FPCI->getFastMathFlags();
- SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
- IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
- dyn_cast<IntrinsicInst>(CI));
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -3437,6 +3428,11 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
@@ -3802,7 +3798,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
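One observation on the SLP hunks above, not new behavior: the call sites now
derive the factor from the bundle's chosen vector type instead of rebuilding
per-argument vector types. A hedged sketch of the vector-side query, assuming
CI is one scalar call of the bundle and VecTy the bundle's vector type
(slpVectorCallCost is an illustrative name):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    InstructionCost slpVectorCallCost(const TargetTransformInfo &TTI,
                                      Intrinsic::ID ID, const CallInst &CI,
                                      FixedVectorType *VecTy) {
      // VF comes from the bundle's vector type, as in the hunk above.
      IntrinsicCostAttributes Attrs(ID, CI, VecTy->getElementCount());
      return TTI.getIntrinsicInstrCost(
          Attrs, TargetTransformInfo::TCK_RecipThroughput);
    }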
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 59c736885eba..b86a7da0daff 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"
; CHECK-COST-LABEL: sadd
; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 26 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 58 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
+; CHECK-COST: Found an estimated cost of 122 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @saddsat(
@@ -21,38 +21,29 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[OFFSET:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]])
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <2 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[WIDE_LOAD]], <2 x i16> [[BROADCAST_SPLAT]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[NEXT_GEP5]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[TMP4]], <2 x i16>* [[TMP5]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -66,10 +57,10 @@ define void @saddsat(i16* nocapture readonly %pSrc, i16 signext %offset, i16* no
; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]])
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP7]], i16 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i16 [[TMP8]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP2:!llvm.loop !.*]]
@@ -99,10 +90,10 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST-LABEL: umin
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 6 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 14 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 30 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 62 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @umin(
@@ -116,87 +107,78 @@ define void @umin(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocaptur
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 15
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]])
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[NEXT_GEP3]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[NEXT_GEP2]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT: [[CAST_CRD7:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD7]]
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 8
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK_NOT_NOT:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1
-; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[BLOCKSIZE]], -1
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], 1
+; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP9]], 8589934584
+; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC4]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC4]]
+; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC4]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT20]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
-; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
-; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
-; CHECK-NEXT: [[INDEX_NEXT11]] = add i64 [[INDEX10]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP17:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX5]]
+; CHECK-NEXT: [[NEXT_GEP18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX5]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[NEXT_GEP17]] to <8 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP10]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD19]], <8 x i8> [[BROADCAST_SPLAT21]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP18]] to <8 x i8>*
+; CHECK-NEXT: store <8 x i8> [[TMP11]], <8 x i8>* [[TMP12]], align 2
+; CHECK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX5]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP9]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8* [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
+; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP14]], i8 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i8 [[TMP15]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP6:!llvm.loop !.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index 5e213f18ebe5..fccdf9b1cc2a 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -157,7 +157,7 @@ while.end: ; preds = %while.body, %entry
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
-; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
+; CHECK-COST: Found an estimated cost of 1 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset)
define void @cttz(i8* nocapture readonly %pSrc, i8 signext %offset, i8* nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @cttz(
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
index ad8e35bf9478..042f03840613 100644
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -4,22 +4,20 @@
; Regression test for a bug in the SLP vectorizer that was causing
; these rotates to be incorrectly combined into a vector rotate.
+; The bug fix is at https://reviews.llvm.org/D85759. This test has
+; been pre-committed to demonstrate the regressed behavior and provide
+; a clear diff for the bug fix.
+
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
-; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
-; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64
-; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
-; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8
-; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
-; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
-; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
-; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[X:%.*]], <2 x i64> [[X]], <2 x i64> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[OUT:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8
; CHECK-NEXT: ret void
;
%a = extractelement <2 x i64> %x, i32 0