[llvm] [LV][TTI] Calculate cost of extracting last index in a scalable vector (PR #144086)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 13 07:41:43 PDT 2025
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/144086
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
This patch adds support for querying the cost of extracting
the last lane by passing a new negative value to
getVectorInstrCost that is different from -1. An index of -1
means the index is completely unknown, whereas -2 means the
last element.
I've also taken the liberty of adding support in vplan for
calculating the cost of VPInstruction::ExtractLastElement as
I happened to spot the opcode after a rebase.
From 863c81e85226b6f15bd25b4147c5b5a89cd0ae1e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 13 Jun 2025 14:30:12 +0000
Subject: [PATCH 1/2] [TTI][NFC] Change Index argument in getVectorInstrCost to
signed
We currently use an unsigned value for the index, but pass in
the value of (unsigned)-1 for unknown indices. This patch
changes the type to an int so that we no longer need the cast
to unsigned. Restricting the range of indices to 0..INT_MAX
shouldn't cause any problems as it's large enough.
I've added a new isKnownVectorIndex helper function to
TargetTransformInfo as an easy way of asking if the index is
known, instead of constantly comparing the index against -1.
---
.../llvm/Analysis/TargetTransformInfo.h | 15 +++++--
.../llvm/Analysis/TargetTransformInfoImpl.h | 12 +++---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 12 +++---
llvm/lib/Analysis/TargetTransformInfo.cpp | 8 ++--
.../AArch64/AArch64TargetTransformInfo.cpp | 42 +++++++++++--------
.../AArch64/AArch64TargetTransformInfo.h | 14 +++----
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +-
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 4 +-
.../Target/AMDGPU/R600TargetTransformInfo.cpp | 5 +--
.../Target/AMDGPU/R600TargetTransformInfo.h | 4 +-
.../lib/Target/ARM/ARMTargetTransformInfo.cpp | 2 +-
llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 4 +-
.../Hexagon/HexagonTargetTransformInfo.cpp | 7 ++--
.../Hexagon/HexagonTargetTransformInfo.h | 4 +-
.../Target/PowerPC/PPCTargetTransformInfo.cpp | 12 +++---
.../Target/PowerPC/PPCTargetTransformInfo.h | 4 +-
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 16 +++----
.../Target/RISCV/RISCVTargetTransformInfo.h | 4 +-
.../SystemZ/SystemZTargetTransformInfo.cpp | 8 ++--
.../SystemZ/SystemZTargetTransformInfo.h | 4 +-
.../WebAssemblyTargetTransformInfo.cpp | 4 +-
.../WebAssemblyTargetTransformInfo.h | 4 +-
.../lib/Target/X86/X86TargetTransformInfo.cpp | 12 +++---
llvm/lib/Target/X86/X86TargetTransformInfo.h | 4 +-
24 files changed, 113 insertions(+), 96 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..0b3b8e95c0cd5 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
/// Index = -1 to indicate that there is no information about the index value.
LLVM_ABI InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const;
+ int Index, TTI::TargetCostKind CostKind) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
const Instruction *I = nullptr) const;
+ enum : int {
+ UnknownIndex = -1,
+ // This will be expanded in a future patch.
+ };
+
+ static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
/// \return The expected cost of vector Insert and Extract.
/// Use -1 to indicate that there is no information on the index value.
/// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
/// vectorizer passes.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1,
+ int Index = UnknownIndex,
const Value *Op0 = nullptr,
const Value *Op1 = nullptr) const;
@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
LLVM_ABI InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
/// exists (e.g., from basic blocks during transformation).
LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index = -1) const;
+ int Index = UnknownIndex) const;
/// \return The expected cost of aggregate inserts and extracts. This is
/// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index, TTI::TargetCostKind CostKind) const {
+ int Index, TTI::TargetCostKind CostKind) const {
return 1;
}
@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
return 1;
}
@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
virtual InstructionCost getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {
virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return 1;
}
@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *IE = dyn_cast<InsertElementInst>(U);
if (!IE)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
auto *EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI)
return TTI::TCC_Basic; // FIXME
- unsigned Idx = -1;
+ int Idx = TargetTransformInfo::UnknownIndex;
if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
if (CI->getValue().getActiveBits() <= 32)
Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override {
return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override {
return getRegUsageForType(Val->getScalarType());
}
@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override {
return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override {
+ int Index) const override {
Value *Op0 = nullptr;
Value *Op1 = nullptr;
if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
}
InstructionCost TargetTransformInfo::getExtractWithExtendCost(
- unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+ unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
assert((Opcode == Instruction::InsertElement ||
Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
}
InstructionCost TargetTransformInfo::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
InstructionCost
TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
// FIXME: Assert that Opcode is either InsertElement or ExtractElement.
// This is mentioned in the interface description and respected by all
// callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..12bb00cdc8e69 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost
AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
- VectorType *VecTy, unsigned Index,
+ VectorType *VecTy, int Index,
TTI::TargetCostKind CostKind) const {
// Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,12 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
}
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I, Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
- if (Index != -1U) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3884,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
bool HasRealUse =
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3892,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
InstructionCost AArch64TTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3902,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const {
+ int Index) const {
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
true /* HasRealUse */, &I);
}
@@ -4052,10 +4051,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
- auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
- CostKind, -1, nullptr, nullptr);
- auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
- CostKind, -1, nullptr, nullptr);
+ auto ExtractCost =
+ 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ auto InsertCost = getVectorInstrCost(
+ Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4155,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
// On AArch64, without SVE, vector divisions are expanded
// into scalar divisions of each pair of elements.
Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
- -1, nullptr, nullptr);
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr);
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr);
}
// TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4190,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
return LT.first;
return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
(getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
- getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
- nullptr, nullptr) *
+ getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr) *
2 +
- getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
- nullptr, nullptr));
+ getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+ TargetTransformInfo::UnknownIndex, nullptr,
+ nullptr));
case ISD::ADD:
case ISD::XOR:
case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCostHelper(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
InstructionCost
getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
- unsigned Index,
+ int Index,
TTI::TargetCostKind CostKind) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
/// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
/// of the extract(nullptr if user is not known before vectorization) and
/// 'Idx' being the extract lane.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, Value *Scalar,
+ TTI::TargetCostKind CostKind, int Index,
+ Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>>
ScalarUserAndIdx) const override;
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index) const override;
+ int Index) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -853,7 +853,7 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
- return Index == ~0u ? 2 : 0;
+ return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631a..7726fa31949da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -169,8 +169,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index 3093227279a31..3bd4a20390e32 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -110,8 +110,7 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
switch (Opcode) {
case Instruction::ExtractElement:
@@ -128,7 +127,7 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
- return Index == ~0u ? 2 : 0;
+ return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
}
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 3deae69bfc8c9..2bcc47a01eb05 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -63,8 +63,8 @@ class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
};
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6c3a1ae7e1775..e0d89ea5d5325 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -901,7 +901,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
// Penalize inserting into an D-subregister. We end up with a three times
// lower estimated throughput on swift.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 20a2c59511087..36b988215c5d3 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -253,8 +253,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index a4cc472fdbf29..f5619d8931fe1 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -316,11 +316,10 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
- Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
- : Val;
+ Type *ElemTy =
+ Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() : Val;
if (Opcode == Instruction::InsertElement) {
// Need two rotations for non-zero index.
unsigned Cost = (Index != 0) ? 2 : 0;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d7509c3bb1d2f..676dd2cabb045 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -154,8 +154,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f9e77f2abdca2..e2eb108167c8e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -674,7 +674,7 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -702,7 +702,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// Computing on 1 bit values requires extra mask or compare operations.
unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
// Computing on non const index requires extra mask or compare operations.
- unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
+ unsigned MaskCostForIdx =
+ TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;
if (ST->hasP9Altivec()) {
// P10 has vxform insert which can handle non const index. The
// MaskCostForIdx is for masking the index.
@@ -711,13 +712,13 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (ISD == ISD::INSERT_VECTOR_ELT) {
if (ST->hasP10Vector())
return CostFactor + MaskCostForIdx;
- if (Index != -1U)
+ if (TargetTransformInfo::isKnownVectorIndex(Index))
return 2 * CostFactor;
} else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
// It's an extract. Maybe we can do a cheap move-from VSR.
unsigned EltSize = Val->getScalarSizeInBits();
// P9 has both mfvsrd and mfvsrld for 64 bit integer.
- if (EltSize == 64 && Index != -1U)
+ if (EltSize == 64 && TargetTransformInfo::isKnownVectorIndex(Index))
return 1;
if (EltSize == 32) {
unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
@@ -734,7 +735,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// (invariant, easily schedulable).
return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
}
- } else if (ST->hasDirectMove() && Index != -1U) {
+ } else if (ST->hasDirectMove() &&
+ TargetTransformInfo::isKnownVectorIndex(Index)) {
// Assume permute has standard cost.
// Assume move-to/move-from VSR have 2x standard cost.
if (ISD == ISD::INSERT_VECTOR_ELT)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 361b2ff223ea0..42c6ffa746fcb 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,8 +129,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bee47527cf428..2f7647e2501f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2211,8 +2211,7 @@ InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
assert(Val->isVectorTy() && "This must be a vector type");
@@ -2227,7 +2226,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (!LT.second.isVector()) {
auto *FixedVecTy = cast<FixedVectorType>(Val);
// If Index is a known constant, cost is zero.
- if (Index != -1U)
+ if (TargetTransformInfo::isKnownVectorIndex(Index))
return 0;
// Extract/InsertElement with non-constant index is very costly when
// scalarized; estimate cost of loads/stores sequence via the stack:
@@ -2280,7 +2279,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// When insertelement we should add the index with 1 as the input of vslideup.
unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
- if (Index != -1U) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// The type may be split. For fixed-width vectors we can normalize the
// index to the new type.
if (LT.second.isFixedLengthVector()) {
@@ -2309,14 +2308,15 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// When the vector needs to split into multiple register groups and the index
// exceeds single vector register group, we need to insert/extract the element
// via stack.
- if (LT.first > 1 &&
- ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
- LT.second.isScalableVector()))) {
+ if (LT.first > 1 && (!TargetTransformInfo::isKnownVectorIndex(Index) ||
+ (Index >= LT.second.getVectorMinNumElements() &&
+ LT.second.isScalableVector()))) {
Type *ScalarType = Val->getScalarType();
Align VecAlign = DL.getPrefTypeAlign(Val);
Align SclAlign = DL.getPrefTypeAlign(ScalarType);
// Extra addi for unknown index.
- InstructionCost IdxCost = Index == -1U ? 1 : 0;
+ InstructionCost IdxCost =
+ TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;
// Store all split vectors into stack and load the target element.
if (Opcode == Instruction::ExtractElement)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..131fe30325216 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -244,8 +244,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 68ba7498d586b..6f9d720896c25 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1194,8 +1194,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index,
- const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
if (Opcode == Instruction::InsertElement) {
// Vector Element Load.
@@ -1205,8 +1204,11 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// vlvgp will insert two grs into a vector register, so count half the
// number of instructions as an estimate when we don't have the full
// picture (as in getScalarizationOverhead()).
- if (Val->isIntOrIntVectorTy(64))
+ if (Val->isIntOrIntVectorTy(64)) {
+ if (!TargetTransformInfo::isKnownVectorIndex(Index))
+ return 0;
return ((Index % 2 == 0) ? 1 : 0);
+ }
}
if (Opcode == Instruction::ExtractElement) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b4bc41974b70b..3e1462338deee 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -122,8 +122,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
bool isFoldableLoad(const LoadInst *Ld,
const Instruction *&FoldedValue) const;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 978e08bb89551..61ffc47e52cfa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -183,13 +183,13 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
}
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
- unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+ unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
const Value *Op0, const Value *Op1) const {
InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
Opcode, Val, CostKind, Index, Op0, Op1);
// SIMD128's insert/extract currently only take constant indices.
- if (Index == -1u)
+ if (!TargetTransformInfo::isKnownVectorIndex(Index))
return Cost + 25 * TargetTransformInfo::TCC_Expensive;
return Cost;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 6b6d060076a80..38d97699f288f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -80,8 +80,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getPartialReductionCost(
unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb23..fc7aab6b41b34 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4767,7 +4767,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ int Index, const Value *Op0,
const Value *Op1) const {
static const CostTblEntry SLMCostTbl[] = {
{ ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
@@ -4782,8 +4782,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// Non-immediate extraction/insertion can be handled as a sequence of
// aliased loads+stores via the stack.
- if (Index == -1U && (Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::InsertElement)) {
+ if (!TargetTransformInfo::isKnownVectorIndex(Index) &&
+ (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
// inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
@@ -4807,8 +4808,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
}
- if (Index != -1U && (Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::InsertElement)) {
+ if (TargetTransformInfo::isKnownVectorIndex(Index) &&
+ (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// Extraction of vXi1 elements are now efficiently handled by MOVMSK.
if (Opcode == Instruction::ExtractElement &&
ScalarType->getScalarSizeInBits() == 1 &&
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80f..58fe7292a1f3d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -165,8 +165,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const Instruction *I = nullptr) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
- TTI::TargetCostKind CostKind,
- unsigned Index, const Value *Op0,
+ TTI::TargetCostKind CostKind, int Index,
+ const Value *Op0,
const Value *Op1) const override;
InstructionCost getScalarizationOverhead(
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
>From 222f4b15c63191f4260dd2e0c6d5c07c8575c42b Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 13 Jun 2025 14:30:30 +0000
Subject: [PATCH 2/2] [LV][TTI] Calculate cost of extracting last index in a
scalable vector
There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
This patch adds support for querying the cost of extracting
the last lane by passing a new negative value (distinct
from -1) to getVectorInstrCost. An index of -1 means the
lane is completely unknown, whereas -2 means the last element.
I've also taken the liberty of adding support in vplan for
calculating the cost of VPInstruction::ExtractLastElement as
I happened to spot the opcode after a rebase.
---
.../llvm/Analysis/TargetTransformInfo.h | 2 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 12 +++++++
.../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++------
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 ++++
.../LoopVectorize/AArch64/masked-call.ll | 27 +++++----------
.../vf-will-not-generate-any-vector-insts.ll | 33 +++++++++++--------
6 files changed, 59 insertions(+), 43 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0b3b8e95c0cd5..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1467,7 +1467,7 @@ class TargetTransformInfo {
enum : int {
UnknownIndex = -1,
- // This will be expanded in a future patch.
+ LastIndex = -2,
};
static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 12bb00cdc8e69..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3716,6 +3716,18 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
assert(Val->isVectorTy() && "This must be a vector type");
+ if (Index == TargetTransformInfo::LastIndex) {
+ if (isa<ScalableVectorType>(Val)) {
+ // This typically requires both while and lastb instructions in order
+ // to extract the last element. If this is in a loop the while
+ // instruction can at least be hoisted out, although it will consume a
+ // predicate register. The cost should be more expensive than the base
+ // extract cost, which is 2 for most CPUs.
+ return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+ }
+ Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+ }
+
if (TargetTransformInfo::isKnownVectorIndex(Index)) {
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa313243a57da..09369fbdce390 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5342,17 +5342,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
StoreInst *SI = cast<StoreInst>(I);
bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
- // TODO: We have existing tests that request the cost of extracting element
- // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
- // the actual generated code, which involves extracting the last element of
- // a scalable vector where the lane to extract is unknown at compile time.
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
- CostKind) +
- (IsLoopInvariantStoreValue
- ? 0
- : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
- CostKind, VF.getKnownMinValue() - 1));
+ InstructionCost Cost =
+ TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) +
+ (IsLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(
+ Instruction::ExtractElement, VectorTy, CostKind,
+ VF.isScalable() ? TargetTransformInfo::LastIndex
+ : VF.getKnownMinValue() - 1));
+ return Cost;
}
InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccce0e07e4d0a..4707a54744174 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -791,6 +791,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case VPInstruction::ExtractLastElement: {
+ // Add on the cost of extracting the element.
+ auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ Ctx.CostKind,
+ TargetTransformInfo::LastIndex);
+ }
case Instruction::ExtractElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 2c0fb797d1d10..bcac0d434ecee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -917,32 +917,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; TFNONE-NEXT: [[ENTRY:.*]]:
; TFNONE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
-; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
; TFNONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; TFNONE: [[VECTOR_PH]]:
-; TFNONE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; TFNONE-NEXT: br label %[[VECTOR_BODY:.*]]
; TFNONE: [[VECTOR_BODY]]:
; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TFNONE-NEXT: [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT: [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
; TFNONE-NEXT: store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; TFNONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; TFNONE-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; TFNONE: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index e7fdfbcf76caa..2816d94d96c3a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -8,7 +8,10 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP0]])
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
@@ -17,23 +20,27 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3]], !noalias [[META0]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> [[BROADCAST_SPLAT3]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 1 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
More information about the llvm-commits
mailing list