[llvm] 2be906b - Revert "[TTI][RISCV] Add cost modelling for intrinsic vp.load.ff (#160470)"

ShihPo Hung via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 26 17:27:21 PDT 2025


Author: ShihPo Hung
Date: 2025-09-26T17:24:56-07:00
New Revision: 2be906b25581eebbd6607a5d99943251617622cb

URL: https://github.com/llvm/llvm-project/commit/2be906b25581eebbd6607a5d99943251617622cb
DIFF: https://github.com/llvm/llvm-project/commit/2be906b25581eebbd6607a5d99943251617622cb.diff

LOG: Revert "[TTI][RISCV] Add cost modelling for intrinsic vp.load.ff (#160470)"

This reverts commit aa08b1a9963f33ded658d3ee655429e1121b5212.

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
    llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 318528021ef75..7a4abe9ee5082 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1619,11 +1619,6 @@ class TargetTransformInfo {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       bool UseMaskForCond = false, bool UseMaskForGaps = false) const;
 
-  /// \return The cost of vp intrinsic vp.load.ff.
-  LLVM_ABI InstructionCost getFirstFaultLoadCost(
-      Type *DataTy, Align Alignment,
-      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
-
   /// A helper function to determine the type of reduction algorithm used
   /// for a given \p Opcode and set of FastMathFlags \p FMF.
   static bool requiresOrderedReduction(std::optional<FastMathFlags> FMF) {

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index bb299becfdcba..566e1cf51631a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -885,12 +885,6 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
-  virtual InstructionCost
-  getFirstFaultLoadCost(Type *DataTy, Align Alignment,
-                        TTI::TargetCostKind CostKind) const {
-    return InstructionCost::getInvalid();
-  }
-
   virtual InstructionCost
   getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                         TTI::TargetCostKind CostKind) const {

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c8dfd9abf2c45..42ddb32d24093 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1795,14 +1795,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         }
       }
 
-      if (ICA.getID() == Intrinsic::vp_load_ff) {
-        Type *RetTy = ICA.getReturnType();
-        Type *DataTy = cast<StructType>(RetTy)->getElementType(0);
-        Align Alignment;
-        if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
-          Alignment = VPI->getPointerAlignment().valueOrOne();
-        return thisT()->getFirstFaultLoadCost(DataTy, Alignment, CostKind);
-      }
       if (ICA.getID() == Intrinsic::vp_scatter) {
         if (ICA.isTypeBasedOnly()) {
           IntrinsicCostAttributes MaskedScatter(

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b9dec59a1ecad..bf62623099a97 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1218,15 +1218,6 @@ InstructionCost TargetTransformInfo::getInterleavedMemoryOpCost(
   return Cost;
 }
 
-InstructionCost
-TargetTransformInfo::getFirstFaultLoadCost(Type *DataTy, Align Alignment,
-                                           TTI::TargetCostKind CostKind) const {
-  InstructionCost Cost =
-      TTIImpl->getFirstFaultLoadCost(DataTy, Alignment, CostKind);
-  assert(Cost >= 0 && "TTI should not produce negative costs!");
-  return Cost;
-}
-
 InstructionCost
 TargetTransformInfo::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                            TTI::TargetCostKind CostKind) const {

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index cfa190fdc0a62..50649cf3caba4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24884,22 +24884,6 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
   return true;
 }
 
-bool RISCVTargetLowering::isLegalFirstFaultLoad(EVT DataType,
-                                                Align Alignment) const {
-  if (!Subtarget.hasVInstructions())
-    return false;
-
-  EVT ScalarType = DataType.getScalarType();
-  if (!isLegalElementTypeForRVV(ScalarType))
-    return false;
-
-  if (!Subtarget.enableUnalignedVectorMem() &&
-      Alignment < ScalarType.getStoreSize())
-    return false;
-
-  return true;
-}
-
 MachineInstr *
 RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
                                    MachineBasicBlock::instr_iterator &MBBI,

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index f9ed914ee84ff..3f81ed74c12ed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -425,10 +425,6 @@ class RISCVTargetLowering : public TargetLowering {
   /// alignment is legal.
   bool isLegalStridedLoadStore(EVT DataType, Align Alignment) const;
 
-  /// Return true if a fault-only-first load of the given result type and
-  /// alignment is legal.
-  bool isLegalFirstFaultLoad(EVT DataType, Align Alignment) const;
-
   unsigned getMaxSupportedInterleaveFactor() const override { return 8; }
 
   bool fallBackToDAGISel(const Instruction &Inst) const override;

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 0e97c85e82da0..563f3bbee81df 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1070,17 +1070,6 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
   return MemCost + ShuffleCost;
 }
 
-InstructionCost
-RISCVTTIImpl::getFirstFaultLoadCost(Type *DataTy, Align Alignment,
-                                    TTI::TargetCostKind CostKind) const {
-  EVT DataTypeVT = TLI->getValueType(DL, DataTy);
-  if (!TLI->isLegalFirstFaultLoad(DataTypeVT, Alignment))
-    return BaseT::getFirstFaultLoadCost(DataTy, Alignment, CostKind);
-
-  return getMemoryOpCost(Instruction::Load, DataTy, Alignment, 0, CostKind,
-                         {TTI::OK_AnyValue, TTI::OP_None}, nullptr);
-}
-
 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index ebf0663881910..6886e8964e29e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -191,10 +191,6 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
       bool UseMaskForCond = false, bool UseMaskForGaps = false) const override;
 
-  InstructionCost
-  getFirstFaultLoadCost(Type *DataTy, Align Alignment,
-                        TTI::TargetCostKind CostKind) const override;
-
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,

diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index ba792d8f0955b..71746caf35f2e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -836,74 +836,74 @@ define void @abs() {
   ret void
 }
 
-define void @load(ptr %src) {
+define void @load() {
 ; CHECK-LABEL: 'load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr %src, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr align 1 %src, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr %src, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr align 1 %src, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr %src, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr align 1 %src, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr %src, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = call { <16 x i8>, i32 } @llvm.vp.load.ff.v16i8.p0(ptr align 1 %src, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %src, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr align 8 %src, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr %src, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t11 = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr align 8 %src, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t12 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr %src, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr align 8 %src, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t14 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr %src, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call { <16 x i64>, i32 } @llvm.vp.load.ff.v16i64.p0(ptr align 8 %src, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t16 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr %src, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr align 1 %src, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr %src, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr align 1 %src, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %src, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr align 1 %src, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t22 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr %src, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr align 1 %src, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr %src, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr align 8 %src, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t26 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr %src, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr align 8 %src, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t28 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr %src, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr align 8 %src, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t30 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr %src, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call { <vscale x 16 x i64>, i32 } @llvm.vp.load.ff.nxv16i64.p0(ptr align 8 %src, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t0 = call <2 x i8> @llvm.vp.load.v2i8.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t1 = load <2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t2 = call <4 x i8> @llvm.vp.load.v4i8.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t3 = load <4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t4 = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t5 = load <8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t6 = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t7 = load <16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t8 = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = load <2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.vp.load.v4i64.p0(ptr undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t12 = load <4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t13 = call <8 x i64> @llvm.vp.load.v8i64.p0(ptr undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t14 = load <8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t15 = call <16 x i64> @llvm.vp.load.v16i64.p0(ptr undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t16 = load <16 x i64>, ptr undef, align 128
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t18 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t20 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t22 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t24 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t26 = load <vscale x 2 x i64>, ptr undef, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64.p0(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t28 = load <vscale x 4 x i64>, ptr undef, align 32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64.p0(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t30 = load <vscale x 8 x i64>, ptr undef, align 64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64.p0(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t32 = load <vscale x 16 x i64>, ptr undef, align 128
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr %src, <2 x i1> undef, i32 undef)
-  %t1 = call { <2 x i8>, i32 } @llvm.vp.load.ff.v2i8.p0(ptr align 1 %src, <2 x i1> undef, i32 undef)
-  %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr %src, <4 x i1> undef, i32 undef)
-  %t3 = call { <4 x i8>, i32 } @llvm.vp.load.ff.v4i8.p0(ptr align 1 %src, <4 x i1> undef, i32 undef)
-  %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr %src, <8 x i1> undef, i32 undef)
-  %t5 = call { <8 x i8>, i32 } @llvm.vp.load.ff.v8i8.p0(ptr align 1 %src, <8 x i1> undef, i32 undef)
-  %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr %src, <16 x i1> undef, i32 undef)
-  %t7 = call { <16 x i8>, i32 } @llvm.vp.load.ff.v16i8.p0(ptr align 1 %src, <16 x i1> undef, i32 undef)
-  %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr %src, <2 x i1> undef, i32 undef)
-  %t9 = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr align 8 %src, <2 x i1> undef, i32 undef)
-  %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr %src, <4 x i1> undef, i32 undef)
-  %t11 = call { <4 x i64>, i32 } @llvm.vp.load.ff.v4i64.p0(ptr align 8 %src, <4 x i1> undef, i32 undef)
-  %t12 = call <8 x i64> @llvm.vp.load.v8i64(ptr %src, <8 x i1> undef, i32 undef)
-  %t13 = call { <8 x i64>, i32 } @llvm.vp.load.ff.v8i64.p0(ptr align 8 %src, <8 x i1> undef, i32 undef)
-  %t14 = call <16 x i64> @llvm.vp.load.v16i64(ptr %src, <16 x i1> undef, i32 undef)
-  %t15 = call { <16 x i64>, i32 } @llvm.vp.load.ff.v16i64.p0(ptr align 8 %src, <16 x i1> undef, i32 undef)
-  %t16 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8(ptr %src, <vscale x 2 x i1> undef, i32 undef)
-  %t17 = call { <vscale x 2 x i8>, i32 } @llvm.vp.load.ff.nxv2i8.p0(ptr align 1 %src, <vscale x 2 x i1> undef, i32 undef)
-  %t18 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8(ptr %src, <vscale x 4 x i1> undef, i32 undef)
-  %t19 = call { <vscale x 4 x i8>, i32 } @llvm.vp.load.ff.nxv4i8.p0(ptr align 1 %src, <vscale x 4 x i1> undef, i32 undef)
-  %t20 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8(ptr %src, <vscale x 8 x i1> undef, i32 undef)
-  %t21 = call { <vscale x 8 x i8>, i32 } @llvm.vp.load.ff.nxv8i8.p0(ptr align 1 %src, <vscale x 8 x i1> undef, i32 undef)
-  %t22 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8(ptr %src, <vscale x 16 x i1> undef, i32 undef)
-  %t23 = call { <vscale x 16 x i8>, i32 } @llvm.vp.load.ff.nxv16i8.p0(ptr align 1 %src, <vscale x 16 x i1> undef, i32 undef)
-  %t24 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64(ptr %src, <vscale x 2 x i1> undef, i32 undef)
-  %t25 = call { <vscale x 2 x i64>, i32 } @llvm.vp.load.ff.nxv2i64.p0(ptr align 8 %src, <vscale x 2 x i1> undef, i32 undef)
-  %t26 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64(ptr %src, <vscale x 4 x i1> undef, i32 undef)
-  %t27 = call { <vscale x 4 x i64>, i32 } @llvm.vp.load.ff.nxv4i64.p0(ptr align 8 %src, <vscale x 4 x i1> undef, i32 undef)
-  %t28 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64(ptr %src, <vscale x 8 x i1> undef, i32 undef)
-  %t29 = call { <vscale x 8 x i64>, i32 } @llvm.vp.load.ff.nxv8i64.p0(ptr align 8 %src, <vscale x 8 x i1> undef, i32 undef)
-  %t30 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64(ptr %src, <vscale x 16 x i1> undef, i32 undef)
-  %t31 = call { <vscale x 16 x i64>, i32 } @llvm.vp.load.ff.nxv16i64.p0(ptr align 8 %src, <vscale x 16 x i1> undef, i32 undef)
+  %t0 = call <2 x i8> @llvm.vp.load.v2i8(ptr undef, <2 x i1> undef, i32 undef)
+  %t1 = load <2 x i8>, ptr undef
+  %t2 = call <4 x i8> @llvm.vp.load.v4i8(ptr undef, <4 x i1> undef, i32 undef)
+  %t3 = load <4 x i8>, ptr undef
+  %t4 = call <8 x i8> @llvm.vp.load.v8i8(ptr undef, <8 x i1> undef, i32 undef)
+  %t5 = load <8 x i8>, ptr undef
+  %t6 = call <16 x i8> @llvm.vp.load.v16i8(ptr undef, <16 x i1> undef, i32 undef)
+  %t7 = load <16 x i8>, ptr undef
+  %t8 = call <2 x i64> @llvm.vp.load.v2i64(ptr undef, <2 x i1> undef, i32 undef)
+  %t9 = load <2 x i64>, ptr undef
+  %t10 = call <4 x i64> @llvm.vp.load.v4i64(ptr undef, <4 x i1> undef, i32 undef)
+  %t12 = load <4 x i64>, ptr undef
+  %t13 = call <8 x i64> @llvm.vp.load.v8i64(ptr undef, <8 x i1> undef, i32 undef)
+  %t14 = load <8 x i64>, ptr undef
+  %t15 = call <16 x i64> @llvm.vp.load.v16i64(ptr undef, <16 x i1> undef, i32 undef)
+  %t16 = load <16 x i64>, ptr undef
+  %t17 = call <vscale x 2 x i8> @llvm.vp.load.nxv2i8(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t18 = load <vscale x 2 x i8>, ptr undef
+  %t19 = call <vscale x 4 x i8> @llvm.vp.load.nxv4i8(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t20 = load <vscale x 4 x i8>, ptr undef
+  %t21 = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t22 = load <vscale x 8 x i8>, ptr undef
+  %t23 = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t24 = load <vscale x 16 x i8>, ptr undef
+  %t25 = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64(ptr undef, <vscale x 2 x i1> undef, i32 undef)
+  %t26 = load <vscale x 2 x i64>, ptr undef
+  %t27 = call <vscale x 4 x i64> @llvm.vp.load.nxv4i64(ptr undef, <vscale x 4 x i1> undef, i32 undef)
+  %t28 = load <vscale x 4 x i64>, ptr undef
+  %t29 = call <vscale x 8 x i64> @llvm.vp.load.nxv8i64(ptr undef, <vscale x 8 x i1> undef, i32 undef)
+  %t30 = load <vscale x 8 x i64>, ptr undef
+  %t31 = call <vscale x 16 x i64> @llvm.vp.load.nxv16i64(ptr undef, <vscale x 16 x i1> undef, i32 undef)
+  %t32 = load <vscale x 16 x i64>, ptr undef
   ret void
 }
 

