[llvm] [LV][TTI] Calculate cost of extracting last index in a scalable vector (PR #144086)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 07:41:43 PDT 2025
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/144086
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
This patch adds support for querying the cost of extracting
the last lane by passing a new negative value to
getVectorInstrCost that is different from -1. An index of -1
means the index is completely unknown, whereas -2 means the
last element.
I've also taken the liberty of adding support in vplan for
calculating the cost of VPInstruction::ExtractLastElement as
I happened to spot the opcode after a rebase.
From 863c81e85226b6f15bd25b4147c5b5a89cd0ae1e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 13 Jun 2025 14:30:12 +0000
Subject: [PATCH 1/2] [TTI][NFC] Change Index argument in getVectorInstrCost to
signed
We currently use an unsigned value for the index, but pass in
the value of (unsigned)-1 for unknown indices. This patch
changes the type to an int so that we no longer need the cast
to unsigned. Restricting the range of indices to 0..INT_MAX
shouldn't cause any problems as it's large enough.
I've added a new isKnownVectorIndex helper function to
TargetTransformInfo as an easy way of asking if the index is
known, instead of constantly comparing the index against -1.
---
.../llvm/Analysis/TargetTransformInfo.h | 15 +++++--
.../llvm/Analysis/TargetTransformInfoImpl.h | 12 +++---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 12 +++---
llvm/lib/Analysis/TargetTransformInfo.cpp | 8 ++--
.../AArch64/AArch64TargetTransformInfo.cpp | 42 +++++++++++--------
.../AArch64/AArch64TargetTransformInfo.h | 14 +++----
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 +-
.../Target/AMDGPU/R600TargetTransformInfo.cpp | 5 +--
.../Target/AMDGPU/R600TargetTransformInfo.h | 4 +-
.../lib/Target/ARM/ARMTargetTransformInfo.cpp | 2 +-
llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 4 +-
.../Hexagon/HexagonTargetTransformInfo.cpp | 7 ++--
.../Hexagon/HexagonTargetTransformInfo.h | 4 +-
.../Target/PowerPC/PPCTargetTransformInfo.cpp | 12 +++---
.../Target/PowerPC/PPCTargetTransformInfo.h | 4 +-
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 16 +++----
.../Target/RISCV/RISCVTargetTransformInfo.h | 4 +-
.../SystemZ/SystemZTargetTransformInfo.cpp | 8 ++--
.../SystemZ/SystemZTargetTransformInfo.h | 4 +-
.../WebAssemblyTargetTransformInfo.cpp | 4 +-
.../WebAssemblyTargetTransformInfo.h | 4 +-
.../lib/Target/X86/X86TargetTransformInfo.cpp | 12 +++---
llvm/lib/Target/X86/X86TargetTransformInfo.h | 4 +-
24 files changed, 113 insertions(+), 96 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..0b3b8e95c0cd5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ // This will be expanded in a future patch.
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..12bb00cdc8e69 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,12 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3884,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3892,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3902,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4051,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4155,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4190,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -853,7 +853,7 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
- return Index == ~0u ? 2 : 0;
+ return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631a..7726fa31949da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -169,8 +169,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index 3093227279a31..3bd4a20390e32 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -110,8 +110,7 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -128,7 +127,7 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
- return Index == ~0u ? 2 : 0;
+ return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 3deae69bfc8c9..2bcc47a01eb05 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -63,8 +63,8 @@ class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
};
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6c3a1ae7e1775..e0d89ea5d5325 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -901,7 +901,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
// Penalize inserting into an D-subregister. We end up with a three times
// lower estimated throughput on swift.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 20a2c59511087..36b988215c5d3 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -253,8 +253,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index a4cc472fdbf29..f5619d8931fe1 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -316,11 +316,10 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
- Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
- : Val;
+ Type *ElemTy =
+ Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() : Val;
if (Opcode == Instruction::InsertElement) {
// Need two rotations for non-zero index.
unsigned Cost = (Index != 0) ? 2 : 0;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d7509c3bb1d2f..676dd2cabb045 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -154,8 +154,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f9e77f2abdca2..e2eb108167c8e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -674,7 +674,7 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -702,7 +702,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// Computing on 1 bit values requires extra mask or compare operations.
unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
// Computing on non const index requires extra mask or compare operations.
- unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
+ unsigned MaskCostForIdx =
+ TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;
if (ST->hasP9Altivec()) {
// P10 has vxform insert which can handle non const index. The
// MaskCostForIdx is for masking the index.
@@ -711,13 +712,13 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (ISD == ISD::INSERT_VECTOR_ELT) {
if (ST->hasP10Vector())
return CostFactor + MaskCostForIdx;
- if (Index != -1U)
+ if (TargetTransformInfo::isKnownVectorIndex(Index))
return 2 * CostFactor;
} else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
// It's an extract. Maybe we can do a cheap move-from VSR.
unsigned EltSize = Val->getScalarSizeInBits();
// P9 has both mfvsrd and mfvsrld for 64 bit integer.
- if (EltSize == 64 && Index != -1U)
+ if (EltSize == 64 && TargetTransformInfo::isKnownVectorIndex(Index))
return 1;
if (EltSize == 32) {
unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
@@ -734,7 +735,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// (invariant, easily schedulable).
return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
}
- } else if (ST->hasDirectMove() && Index != -1U) {
+ } else if (ST->hasDirectMove() &&
+ TargetTransformInfo::isKnownVectorIndex(Index)) {
// Assume permute has standard cost.
// Assume move-to/move-from VSR have 2x standard cost.
if (ISD == ISD::INSERT_VECTOR_ELT)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 361b2ff223ea0..42c6ffa746fcb 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,8 +129,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bee47527cf428..2f7647e2501f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2211,8 +2211,7 @@ InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -2227,7 +2226,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (!LT.second.isVector()) {
auto *FixedVecTy = cast<FixedVectorType>(Val);
// If Index is a known constant, cost is zero.
- if (Index != -1U)
+ if (TargetTransformInfo::isKnownVectorIndex(Index))
return 0;
// Extract/InsertElement with non-constant index is very costly when
// scalarized; estimate cost of loads/stores sequence via the stack:
@@ -2280,7 +2279,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// When insertelement we should add the index with 1 as the input of vslideup.
unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
- if (Index != -1U) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// The type may be split. For fixed-width vectors we can normalize the
// index to the new type.
if (LT.second.isFixedLengthVector()) {
@@ -2309,14 +2308,15 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// When the vector needs to split into multiple register groups and the index
// exceeds single vector register group, we need to insert/extract the element
// via stack.
- if (LT.first > 1 &&
- ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
- LT.second.isScalableVector()))) {
+ if (LT.first > 1 && (!TargetTransformInfo::isKnownVectorIndex(Index) ||
+ (Index >= LT.second.getVectorMinNumElements() &&
+ LT.second.isScalableVector()))) {
Type *ScalarType = Val->getScalarType();
Align VecAlign = DL.getPrefTypeAlign(Val);
Align SclAlign = DL.getPrefTypeAlign(ScalarType);
// Extra addi for unknown index.
- InstructionCost IdxCost = Index == -1U ? 1 : 0;
+ InstructionCost IdxCost =
+ TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;
// Store all split vectors into stack and load the target element.
if (Opcode == Instruction::ExtractElement)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..131fe30325216 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -244,8 +244,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 68ba7498d586b..6f9d720896c25 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1194,8 +1194,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
if (Opcode == Instruction::InsertElement) {
// Vector Element Load.
@@ -1205,8 +1204,11 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// vlvgp will insert two grs into a vector register, so count half the
// number of instructions as an estimate when we don't have the full
// picture (as in getScalarizationOverhead()).
- if (Val->isIntOrIntVectorTy(64))
+ if (Val->isIntOrIntVectorTy(64)) {
+ if (!TargetTransformInfo::isKnownVectorIndex(Index))
+ return 0;
return ((Index % 2 == 0) ? 1 : 0);
+ }
}
if (Opcode == Instruction::ExtractElement) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b4bc41974b70b..3e1462338deee 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -122,8 +122,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
bool isFoldableLoad(const LoadInst *Ld,
const Instruction *&FoldedValue) const;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 978e08bb89551..61ffc47e52cfa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -183,13 +183,13 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
}
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
Opcode, Val, CostKind, Index, Op0, Op1);
// SIMD128's insert/extract currently only take constant indices.
- if (Index == -1u)
+ if (!TargetTransformInfo::isKnownVectorIndex(Index))
return Cost + 25 * TargetTransformInfo::TCC_Expensive;
return Cost;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 6b6d060076a80..38d97699f288f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -80,8 +80,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getPartialReductionCost(
unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb23..fc7aab6b41b34 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4767,7 +4767,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
static const CostTblEntry SLMCostTbl[] = {
{ ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
@@ -4782,8 +4782,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// Non-immediate extraction/insertion can be handled as a sequence of
// aliased loads+stores via the stack.
- if (Index == -1U && (Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::InsertElement)) {
+ if (!TargetTransformInfo::isKnownVectorIndex(Index) &&
+ (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
// inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
@@ -4807,8 +4808,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
}
- if (Index != -1U && (Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::InsertElement)) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index) &&
+ (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// Extraction of vXi1 elements are now efficiently handled by MOVMSK.
if (Opcode == Instruction::ExtractElement &&
ScalarType->getScalarSizeInBits() == 1 &&
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80f..58fe7292a1f3d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -165,8 +165,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
>From 222f4b15c63191f4260dd2e0c6d5c07c8575c42b Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 13 Jun 2025 14:30:30 +0000
Subject: [PATCH 2/2] [LV][TTI] Calculate cost of extracting last index in a
scalable vector
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
This patch adds support for querying the cost of extracting
the last lane by passing a new negative value (distinct
from -1) to getVectorInstrCost. An index of -1 means the
lane is completely unknown, whereas -2 means the last element.
I've also taken the liberty of adding support in vplan for
calculating the cost of VPInstruction::ExtractLastElement as
I happened to spot the opcode after a rebase.
---
.../llvm/Analysis/TargetTransformInfo.h | 2 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 12 +++++++
.../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 ++++
.../LoopVectorize/AArch64/masked-call.ll | 27 +++++----------
.../vf-will-not-generate-any-vector-insts.ll | 33 +++++++++++--------
6 files changed, 59 insertions(+), 43 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0b3b8e95c0cd5..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1467,7 +1467,7 @@ class TargetTransformInfo {
enum : int {
UnknownIndex = -1,
- // This will be expanded in a future patch.
+ LastIndex = -2,
};
static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 12bb00cdc8e69..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3716,6 +3716,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa313243a57da..09369fbdce390 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5342,17 +5342,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
StoreInst *SI = cast<StoreInst>(I);
bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
- // TODO: We have existing tests that request the cost of extracting element
- // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
- // the actual generated code, which involves extracting the last element of
- // a scalable vector where the lane to extract is unknown at compile time.
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
- CostKind) +
- (IsLoopInvariantStoreValue
- ? 0
- : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- CostKind, VF.getKnownMinValue() - 1));
+ InstructionCost Cost =
+ TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) +
+ (IsLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(
+ Instruction::ExtractElement, VectorTy, CostKind,
+ VF.isScalable() ? TargetTransformInfo::LastIndex
+ : VF.getKnownMinValue() - 1));
+ return Cost;
}
InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccce0e07e4d0a..4707a54744174 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -791,6 +791,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case VPInstruction::ExtractLastElement: {
+ // Add on the cost of extracting the element.
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ Ctx.CostKind,
+ TargetTransformInfo::LastIndex);
+ }
case Instruction::ExtractElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 2c0fb797d1d10..bcac0d434ecee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -917,32 +917,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; TFNONE-NEXT: [[ENTRY:.*]]:
; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
-; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; TFNONE: [[VECTOR_PH]]:
-; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
; TFNONE: [[VECTOR_BODY]]:
; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; TFNONE: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index e7fdfbcf76caa..2816d94d96c3a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -8,7 +8,10 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP0]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
@@ -17,23 +20,27 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3]], !noalias [[META0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> [[BROADCAST_SPLAT3]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 1 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
More information about the llvm-commits
mailing list