[llvm] [LV][TTI] Calculate cost of extracting last index in a scalable vector (PR #144086)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 07:42:12 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-hexagon
Author: David Sherwood (david-arm)
<details>
<summary>Changes</summary>
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, extracting the true last lane is non-trivial as
it requires the use of whilelo and lastb instructions.
This patch adds support for querying the cost of extracting
the last lane by passing a new negative sentinel value to
getVectorInstrCost that is distinct from -1. An index of -1
(UnknownIndex) means the index is completely unknown, whereas
-2 (LastIndex) means the last element.
I've also taken the liberty of adding support in VPlan for
calculating the cost of VPInstruction::ExtractLastElement, as
I happened to spot the opcode after a rebase.
---
Patch is 53.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144086.diff
28 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+11-4)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+6-6)
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+6-6)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4-4)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+36-18)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+7-7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+2-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp (+2-3)
- (modified) llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp (+1-1)
- (modified) llvm/lib/Target/ARM/ARMTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp (+3-4)
- (modified) llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (+7-5)
- (modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+8-8)
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp (+5-3)
- (modified) llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp (+2-2)
- (modified) llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.cpp (+7-5)
- (modified) llvm/lib/Target/X86/X86TargetTransformInfo.h (+2-2)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+10-11)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+7)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll (+9-18)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll (+20-13)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ LastIndex = -2,
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/144086
More information about the llvm-commits
mailing list