[llvm] [Analysis][RISCV] More accurately estimate the cost of strided vector loads (PR #175135)

Tue Mar 3 00:10:35 PST 2026

https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/175135

>From 0590e8d7cc5fe2528dd1ed4adff365a303d62f6f Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:36:28 -0800
Subject: [PATCH 1/7] [RISCV] Add new test cases for RISCV strided load cost
 modeling

---
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 9e8f727978001..fee3953ebdc95 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1223,7 +1223,32 @@ define void @strided_store() {
   ret void
 }
 
+define void @constant_strided_load() {
+; CHECK-LABEL: 'constant_strided_load'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
+  %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+  %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef) ; No overlap
+  %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef) ; Barely overlap
+  %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef) ; Should be limited by MaxCombines
+
+  call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+  call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef) ; Negative stride
+  call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+  call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
 
+  ret void
+}
 
 define void @reduce_add() {
 ; CHECK-LABEL: 'reduce_add'

>From 60c635fe508c47568e39268476fcc39d662a5029 Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 14:01:34 -0800
Subject: [PATCH 2/7] [Analysis] Add StrideVal to
 TargetTransformInfo::MemIntrinsicCostAttributes

For strided memory accesses with constant strides, this will allow us to
properly cost cases where consecutive elements fall on the same
cache line.
---
 .../llvm/Analysis/TargetTransformInfo.h        | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8b06b4aae26ce..be8171466f622 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -149,25 +149,30 @@ class MemIntrinsicCostAttributes {
   /// Alignment of single element.
   Align Alignment;
 
+  const Value *StrideVal;
+
 public:
   LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
                                       const Value *Ptr, bool VariableMask,
                                       Align Alignment,
-                                      const Instruction *I = nullptr)
+                                      const Instruction *I = nullptr,
+                                      const Value *StrideVal = nullptr)
       : I(I), Ptr(Ptr), DataTy(DataTy), IID(Id), VariableMask(VariableMask),
-        Alignment(Alignment) {}
+        Alignment(Alignment), StrideVal(StrideVal) {}
 
   LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
                                       Align Alignment,
-                                      unsigned AddressSpace = 0)
+                                      unsigned AddressSpace = 0,
+                                      const Value *StrideVal = nullptr)
       : DataTy(DataTy), IID(Id), AddressSpace(AddressSpace),
-        Alignment(Alignment) {}
+        Alignment(Alignment), StrideVal(StrideVal) {}
 
   LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
                                       bool VariableMask, Align Alignment,
-                                      const Instruction *I = nullptr)
+                                      const Instruction *I = nullptr,
+                                      const Value *StrideVal = nullptr)
       : I(I), DataTy(DataTy), IID(Id), VariableMask(VariableMask),
-        Alignment(Alignment) {}
+        Alignment(Alignment), StrideVal(StrideVal) {}
 
   Intrinsic::ID getID() const { return IID; }
   const Instruction *getInst() const { return I; }
@@ -176,6 +181,7 @@ class MemIntrinsicCostAttributes {
   bool getVariableMask() const { return VariableMask; }
   unsigned getAddressSpace() const { return AddressSpace; }
   Align getAlignment() const { return Alignment; }
+  const Value *getStrideVal() const { return StrideVal; }
 };
 
 class IntrinsicCostAttributes {

>From 9140d7656a0171cfe968b79c221781965b0737dd Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:03 -0800
Subject: [PATCH 3/7] [Analysis] Pass along the stride when calculating strided
 load costs in BasicTTIImpl

---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ef91c845ce9e7..6d67a7feb703c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1948,6 +1948,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::experimental_vp_strided_store: {
       const Value *Data = Args[0];
       const Value *Ptr = Args[1];
+      const Value *Stride = Args[2];
       const Value *Mask = Args[3];
       const Value *EVL = Args[4];
       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
@@ -1956,11 +1957,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
           I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
       return thisT()->getMemIntrinsicInstrCost(
           MemIntrinsicCostAttributes(IID, Data->getType(), Ptr, VarMask,
-                                     Alignment, I),
+                                     Alignment, I, Stride),
           CostKind);
     }
     case Intrinsic::experimental_vp_strided_load: {
       const Value *Ptr = Args[0];
+      const Value *Stride = Args[1];
       const Value *Mask = Args[2];
       const Value *EVL = Args[3];
       bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
@@ -1968,7 +1970,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       Align Alignment =
           I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
       return thisT()->getMemIntrinsicInstrCost(
-          MemIntrinsicCostAttributes(IID, RetTy, Ptr, VarMask, Alignment, I),
+          MemIntrinsicCostAttributes(IID, RetTy, Ptr, VarMask, Alignment, I,
+                                     Stride),
           CostKind);
     }
     case Intrinsic::stepvector: {

>From 27f4e62c99fd0e281c9b3c20daa2fe81ad51ecef Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:18 -0800
Subject: [PATCH 4/7] [RISCV] Add MaxVectorCoalesceElts to RISCVTuneInfo

Controls the maximum number of elements that can be
coalesced on a vector load if allowed by the access
pattern and cache line size.
---
 llvm/lib/Target/RISCV/RISCVProcessors.td | 4 +++-
 llvm/lib/Target/RISCV/RISCVSubtarget.h   | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 5becfd2ad502b..c8242c34eba8e 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -43,6 +43,8 @@ class RISCVTuneInfo {
   bits<32> MaxLoadsPerMemcmpOptSize = 4;
   bits<32> MaxLoadsPerMemcmp = 8;
 
+  bits<8> MaxVectorCoalesceElts = 2;
+
   // The direction of PostRA scheduling.
   code PostRASchedDirection = TopDown;
 }
@@ -58,7 +60,7 @@ def RISCVTuneInfoTable : GenericTable {
                 "MaxStoresPerMemcpyOptSize", "MaxStoresPerMemcpy",
                 "MaxStoresPerMemmoveOptSize", "MaxStoresPerMemmove",
                 "MaxLoadsPerMemcmpOptSize", "MaxLoadsPerMemcmp",
-                "PostRASchedDirection"];
+                "MaxVectorCoalesceElts", "PostRASchedDirection"];
 }
 
 def getRISCVTuneInfo : SearchIndex {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index b2e0abbdd6e64..fe7f16968ada9 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -68,6 +68,10 @@ struct RISCVTuneInfo {
   unsigned MaxLoadsPerMemcmpOptSize;
   unsigned MaxLoadsPerMemcmp;
 
+  // How many vector elements can be coalesced if on the
+  // same cache line
+  uint8_t MaxVectorCoalesceElts;
+
   // The direction of PostRA scheduling.
   MISched::Direction PostRASchedDirection;
 };
@@ -432,6 +436,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
                    : TuneInfo->MaxLoadsPerMemcmp;
   }
 
+  uint8_t getMaxVectorCoalesceElts() const {
+    return TuneInfo->MaxVectorCoalesceElts;
+  }
+
   MISched::Direction getPostRASchedDirection() const {
     return TuneInfo->PostRASchedDirection;
   }

>From 869a4cfcb39514ba03946731661e31e385f2138b Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:34 -0800
Subject: [PATCH 5/7] [RISCV] Consider the stride if known in
 RISCVTTIImpl::getStridedMemoryOpCost()

When the stride is small and constant, we can predict the speedup
due to memory coalescing.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 19 +++++++++++
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 34 +++++++++++++------
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e812d092c3ea0..300371146c759 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1274,6 +1274,25 @@ RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
       getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                       {TTI::OK_AnyValue, TTI::OP_None}, I);
   unsigned NumLoads = getEstimatedVLFor(&VTy);
+  // Performant implementations of the vector extension will attempt to re-use
+  // elements if they fall on the same cache line
+  uint64_t CacheLineBytes = ST->getCacheLineSize();
+  if (!CacheLineBytes) // If no value, use default value of 64
+    CacheLineBytes = 64;
+  unsigned EltsPerCL = (CacheLineBytes * 8) / DataTy->getScalarSizeInBits();
+  if (const ConstantInt *StrideCI =
+          dyn_cast_or_null<ConstantInt>(MICA.getStrideVal())) {
+    uint64_t AbsStride = (uint64_t)std::abs(StrideCI->getSExtValue());
+    if (AbsStride < EltsPerCL) {
+      uint64_t MaxCombines = ST->getMaxVectorCoalesceElts();
+      if ((EltsPerCL / AbsStride) >= MaxCombines)
+        NumLoads = divideCeil(NumLoads, MaxCombines);
+      else
+        // If we were to calculate EltsPerCL / AbsStride first, would lose
+        // accuracy
+        NumLoads = divideCeil((NumLoads * AbsStride), EltsPerCL);
+    }
+  }
   return NumLoads * MemOpCost;
 }
 
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index fee3953ebdc95..94572f0272bfd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1224,17 +1224,29 @@ define void @strided_store() {
 }
 
 define void @constant_strided_load() {
-; CHECK-LABEL: 'constant_strided_load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ARGBASED-LABEL: 'constant_strided_load'
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'constant_strided_load'
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
   %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)

>From 574199b09b459504e2a18095ede4ae6971a18e91 Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Thu, 15 Jan 2026 11:54:37 -0800
Subject: [PATCH 6/7] Use poison rather than undef in test

---
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 94572f0272bfd..97a954b2657e5 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1225,39 +1225,39 @@ define void @strided_store() {
 
 define void @constant_strided_load() {
 ; ARGBASED-LABEL: 'constant_strided_load'
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
 ; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'constant_strided_load'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
-  %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-  %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef) ; No overlap
-  %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef) ; Barely overlap
-  %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef) ; Should be limited by MaxCombines
+  %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
+  %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+  %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison) ; No overlap
+  %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
+  %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
 
-  call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-  call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef) ; Negative stride
-  call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-  call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+  call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+  call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison) ; Negative stride
+  call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+  call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
 
   ret void
 }

>From f863a971284d8a4b336f714eeaa36e047c3ea425 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 2 Mar 2026 23:43:47 -0800
Subject: [PATCH 7/7] [RISCV] Adjust for the fact that stride is in bytes, not
 elements

---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  9 ++--
 .../Analysis/CostModel/RISCV/vp-intrinsics.ll | 48 +++++++++----------
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 300371146c759..84ba004e03d04 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1279,18 +1279,17 @@ RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
   uint64_t CacheLineBytes = ST->getCacheLineSize();
   if (!CacheLineBytes) // If no value, use default value of 64
     CacheLineBytes = 64;
-  unsigned EltsPerCL = (CacheLineBytes * 8) / DataTy->getScalarSizeInBits();
   if (const ConstantInt *StrideCI =
           dyn_cast_or_null<ConstantInt>(MICA.getStrideVal())) {
     uint64_t AbsStride = (uint64_t)std::abs(StrideCI->getSExtValue());
-    if (AbsStride < EltsPerCL) {
+    if (AbsStride < CacheLineBytes) {
       uint64_t MaxCombines = ST->getMaxVectorCoalesceElts();
-      if ((EltsPerCL / AbsStride) >= MaxCombines)
+      if ((CacheLineBytes / AbsStride) >= MaxCombines)
         NumLoads = divideCeil(NumLoads, MaxCombines);
       else
-        // If we were to calculate EltsPerCL / AbsStride first, would lose
+        // Computing CacheLineBytes / AbsStride first would truncate and lose
         // accuracy
-        NumLoads = divideCeil((NumLoads * AbsStride), EltsPerCL);
+        NumLoads = divideCeil((NumLoads * AbsStride), CacheLineBytes);
     }
   }
   return NumLoads * MemOpCost;
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 97a954b2657e5..0754d2fa0f151 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1225,39 +1225,39 @@ define void @strided_store() {
 
 define void @constant_strided_load() {
 ; ARGBASED-LABEL: 'constant_strided_load'
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison)
 ; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
 ; ARGBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'constant_strided_load'
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
 ; TYPEBASED-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
-  %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
-  %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison) ; No overlap
-  %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
-  %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
+  %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
+  %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+  %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison) ; No overlap
+  %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
+  %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
 
   call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
-  call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison) ; Negative stride
-  call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
-  call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+  call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison) ; Negative stride
+  call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+  call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
 
   ret void
 }