[llvm] [LV][TTI] Calculate cost of extracting last index in a scalable vector (PR #144086)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 14 08:09:00 PDT 2025


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/144086

>From 8841bd3de31868fb3d0a99d7260082f04e2c5781 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 12 Aug 2025 13:28:47 +0000
Subject: [PATCH 1/2] [LV][TTI] Calculate cost of extracting last index in a
 scalable vector

There are a couple of places in the loop vectoriser where we
want to calculate the cost of extracting the last lane in a
vector. However, we wrongly assume that asking for the cost
of extracting lane (VF.getKnownMinValue() - 1) is an accurate
representation of the cost of extracting the last lane. For
SVE at least, this is non-trivial as it requires the use of
whilelo and lastb instructions.
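
For reference, the AArch64 hook added below prices exactly that
whilelo + lastb sequence. A minimal sketch of the reasoning (the
helper name here is illustrative; the real code is
AArch64TTIImpl::getReverseVectorInstrCost):

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Extracting the last lane of a scalable vector needs a whilelo to form
  // the predicate and a lastb to read the element, so the cost sits above
  // the target's base insert/extract cost (or a flat 2 for code size).
  static InstructionCost
  lastLaneExtractCostSVE(unsigned BaseCost,
                         TargetTransformInfo::TargetCostKind CostKind) {
    return CostKind == TargetTransformInfo::TCK_CodeSize ? 2 : BaseCost + 1;
  }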

To solve this problem I have added a new
getReverseVectorInstrCost interface where the index is
interpreted as an offset from the end of the vector: for a
vector with ElementCount EC, the extracted/inserted lane is
(EC - 1 - Index). For scalable vectors this lane is unknown at
compile time. I've added an AArch64 hook that better represents
the cost, and also a RISCV hook that maintains compatibility
with the behaviour prior to this PR.
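
A minimal sketch of that index mapping for the fixed-width case,
mirroring the BasicTTIImpl default added later in this PR (the
free function is purely illustrative):

  #include "llvm/IR/DerivedTypes.h"
  #include <cassert>
  using namespace llvm;

  // For a fixed-width vector the lane counted from the end folds to a
  // constant, e.g. <4 x i32> with Index 0 gives lane 3. For a scalable
  // vector the lane depends on vscale, so no compile-time index exists.
  static unsigned laneFromEnd(Type *Val, unsigned Index) {
    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
      assert(Index < FVTy->getNumElements() && "Unexpected index from end");
      return FVTy->getNumElements() - 1 - Index;
    }
    return -1U; // scalable: lane is only known at runtime
  }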

I've also taken the liberty of adding support in VPlan for
calculating the cost of VPInstruction::ExtractLastElement.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  8 ++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  7 ++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     | 10 ++++++++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 20 +++++++++++++++
 .../AArch64/AArch64TargetTransformInfo.h      |  4 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 14 +++++++++++
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  4 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 19 ++++++++------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 +++++
 .../LoopVectorize/AArch64/masked-call.ll      | 25 +++++++------------
 10 files changed, 94 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9186419715cc4..987191dd31f2b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1512,6 +1512,14 @@ class TargetTransformInfo {
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index = -1) const;
 
+  /// \return The expected cost of inserting or extracting a lane that is \p
+  /// Index from the end of a vector, i.e. the mathematical expression for
+  /// the lane is (VF - 1 - Index). This is required for scalable vectors where
+  /// the exact lane index is unknown at compile time.
+  LLVM_ABI InstructionCost
+  getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind, unsigned Index) const;
+
   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
   /// provision the cost of vectorization/scalarization in vectorizer passes.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 200cbafbaa6e2..90f6276842449 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -809,6 +809,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  virtual InstructionCost
+  getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                            TTI::TargetCostKind CostKind,
+                            unsigned Index) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3141060a710ce..45dc3d55f5de4 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1130,6 +1130,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
+                                               unsigned Index) const {
+  InstructionCost Cost =
+      TTIImpl->getReverseVectorInstrCost(Opcode, Val, CostKind, Index);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getInsertExtractValueCost(
     unsigned Opcode, TTI::TargetCostKind CostKind) const {
   assert((Opcode == Instruction::InsertValue ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3fba7e853eafb..e0b8b3ad7f4e5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3976,6 +3976,26 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
+InstructionCost
+AArch64TTIImpl::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                                          TTI::TargetCostKind CostKind,
+                                          unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val)) {
+    unsigned NumElems = FixedVecTy->getNumElements();
+    assert(Index < NumElems && "Unexpected reverse index");
+    return getVectorInstrCostHelper(Opcode, Val, CostKind,
+                                    NumElems - 1 - Index);
+  }
+  // This typically requires both while and lastb instructions in order
+  // to extract the last element. If this is in a loop the while
+  // instruction can at least be hoisted out, although it will consume a
+  // predicate register. The cost should be more expensive than the base
+  // extract cost, which is 2 for most CPUs.
+  return CostKind == TTI::TCK_CodeSize
+             ? 2
+             : ST->getVectorInsertExtractBaseCost() + 1;
+}
+
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
     VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
     TTI::TargetCostKind CostKind, bool ForPoisonSrc,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9c96fdd427814..eb0cc7879f8ce 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
+  InstructionCost getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost
   getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                          TTI::TargetCostKind CostKind) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index af78b3cc2c7ff..32c3ba1ad95c4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2415,6 +2415,20 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return BaseCost + SlideCost;
 }
 
+InstructionCost
+RISCVTTIImpl::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                                        TTI::TargetCostKind CostKind,
+                                        unsigned Index) const {
+  // TODO: This code replicates what LoopVectorize.cpp used to do when asking
+  // for the cost of extracting the last lane of a scalable vector. It probably
+  // needs a more accurate cost.
+  ElementCount EC = cast<VectorType>(Val)->getElementCount();
+  assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
+  return getVectorInstrCost(Opcode, Val, CostKind,
+                            EC.getKnownMinValue() - 1 - Index, nullptr,
+                            nullptr);
+}
+
 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 6a1f4b3e3bedf..fed0c540b19c3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,6 +243,10 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
+  InstructionCost getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override;
+
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
       TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb37ec3e94809..b877535abcf17 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5304,13 +5304,18 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
   // the actual generated code, which involves extracting the last element of
   // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
+  if (!IsLoopInvariantStoreValue) {
+    if (VF.isFixed())
+      Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+                                     CostKind, VF.getKnownMinValue() - 1);
+    else
+      Cost += TTI.getReverseVectorInstrCost(Instruction::ExtractElement,
+                                            VectorTy, CostKind, 0);
+  }
+  return Cost;
 }
 
 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7bbd0dc325a9b..05ae2b09ffb83 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1011,6 +1011,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getReverseVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                             Ctx.CostKind, 0);
+  }
   case VPInstruction::ExtractPenultimateElement:
     if (VF == ElementCount::getScalable(1))
       return InstructionCost::getInvalid();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 7028678b338f0..f8d54c0d5a8e3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -903,30 +903,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT:  [[ENTRY:.*]]:
 ; TFNONE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2
-; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
-; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
-; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT:    [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 2
-; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT:    [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT:    [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT:    store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE:       [[MIDDLE_BLOCK]]:

>From 5c72770750ab99e9239ebd575fed1821d555dbe9 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 14 Aug 2025 14:46:55 +0000
Subject: [PATCH 2/2] Address review comments

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h    |  2 +-
 .../include/llvm/Analysis/TargetTransformInfoImpl.h |  2 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h            | 13 +++++++++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp           |  4 ++--
 .../Target/AArch64/AArch64TargetTransformInfo.cpp   | 11 ++++-------
 .../lib/Target/AArch64/AArch64TargetTransformInfo.h |  2 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp  |  5 ++++-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h    |  2 +-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp     | 11 +++--------
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp      |  2 +-
 10 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 987191dd31f2b..19bd70a91606e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1517,7 +1517,7 @@ class TargetTransformInfo {
   /// the lane is (VF - 1 - Index). This is required for scalable vectors where
   /// the exact lane index is unknown at compile time.
   LLVM_ABI InstructionCost
-  getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                             TTI::TargetCostKind CostKind, unsigned Index) const;
 
   /// \return The expected cost of aggregate inserts and extracts. This is
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 90f6276842449..138ee3bbed40d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -810,7 +810,7 @@ class TargetTransformInfoImplBase {
   }
 
   virtual InstructionCost
-  getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+  getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                             TTI::TargetCostKind CostKind,
                             unsigned Index) const {
     return 1;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index aa9d1f0a1ccea..124b93804a630 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1444,6 +1444,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
                                        Op1);
   }
 
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
+                                            TTI::TargetCostKind CostKind,
+                                            unsigned Index) const override {
+    unsigned NewIndex = -1;
+    if (auto *FVTy = dyn_cast<FixedVectorType>(Val)) {
+      assert(Index < FVTy->getNumElements() &&
+             "Unexpected index from end of vector");
+      NewIndex = FVTy->getNumElements() - 1 - Index;
+    }
+    return thisT()->getVectorInstrCost(Opcode, Val, CostKind, NewIndex, nullptr,
+                                       nullptr);
+  }
+
   InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 45dc3d55f5de4..262fe51e41739 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1131,11 +1131,11 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
 }
 
 InstructionCost
-TargetTransformInfo::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+TargetTransformInfo::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                                TTI::TargetCostKind CostKind,
                                                unsigned Index) const {
   InstructionCost Cost =
-      TTIImpl->getReverseVectorInstrCost(Opcode, Val, CostKind, Index);
+      TTIImpl->getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e0b8b3ad7f4e5..046b1e800432c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3977,15 +3977,12 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
 }
 
 InstructionCost
-AArch64TTIImpl::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+AArch64TTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                           TTI::TargetCostKind CostKind,
                                           unsigned Index) const {
-  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val)) {
-    unsigned NumElems = FixedVecTy->getNumElements();
-    assert(Index < NumElems && "Unexpected reverse index");
-    return getVectorInstrCostHelper(Opcode, Val, CostKind,
-                                    NumElems - 1 - Index);
-  }
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
   // This typically requires both while and lastb instructions in order
   // to extract the last element. If this is in a loop the while
   // instruction can at least be hoisted out, although it will consume a
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index eb0cc7879f8ce..36706096cf964 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,7 +220,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const override;
 
-  InstructionCost getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                             TTI::TargetCostKind CostKind,
                                             unsigned Index) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 32c3ba1ad95c4..a0fce2812fe9a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2416,9 +2416,12 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 }
 
 InstructionCost
-RISCVTTIImpl::getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+RISCVTTIImpl::getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                         TTI::TargetCostKind CostKind,
                                         unsigned Index) const {
+  if (auto *FixedVecTy = dyn_cast<FixedVectorType>(Val))
+    return BaseT::getVectorInstrCostFromEnd(Opcode, Val, CostKind, Index);
+
   // TODO: This code replicates what LoopVectorize.cpp used to do when asking
   // for the cost of extracting the last lane of a scalable vector. It probably
   // needs a more accurate cost.
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index fed0c540b19c3..f502904645d0e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -243,7 +243,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
                                      unsigned Index, const Value *Op0,
                                      const Value *Op1) const override;
 
-  InstructionCost getReverseVectorInstrCost(unsigned Opcode, Type *Val,
+  InstructionCost getVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                             TTI::TargetCostKind CostKind,
                                             unsigned Index) const override;
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b877535abcf17..b5fc43861de45 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5307,14 +5307,9 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   InstructionCost Cost =
       TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, CostKind) +
       TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind);
-  if (!IsLoopInvariantStoreValue) {
-    if (VF.isFixed())
-      Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                     CostKind, VF.getKnownMinValue() - 1);
-    else
-      Cost += TTI.getReverseVectorInstrCost(Instruction::ExtractElement,
-                                            VectorTy, CostKind, 0);
-  }
+  if (!IsLoopInvariantStoreValue)
+    Cost += TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VectorTy,
+                                          CostKind, 0);
   return Cost;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 05ae2b09ffb83..2cd17db58eb22 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1014,7 +1014,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   case VPInstruction::ExtractLastElement: {
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
-    return Ctx.TTI.getReverseVectorInstrCost(Instruction::ExtractElement, VecTy,
+    return Ctx.TTI.getVectorInstrCostFromEnd(Instruction::ExtractElement, VecTy,
                                              Ctx.CostKind, 0);
   }
   case VPInstruction::ExtractPenultimateElement:


