[llvm] [Analysis][RISCV] More accurately estimate the cost of strided vector loads (PR #175135)
Ryan Buchner via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 00:10:35 PST 2026
https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/175135
>From 0590e8d7cc5fe2528dd1ed4adff365a303d62f6f Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:36:28 -0800
Subject: [PATCH 1/7] [RISCV] Add new test cases for RISCV strided load cost
modeling
---
.../Analysis/CostModel/RISCV/vp-intrinsics.ll | 25 +++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 9e8f727978001..fee3953ebdc95 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1223,7 +1223,32 @@ define void @strided_store() {
ret void
}
+define void @constant_strided_load() {
+; CHECK-LABEL: 'constant_strided_load'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
+ %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+ %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef) ; No overlap
+ %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef) ; Barely overlap
+ %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef) ; Should be limited by MaxCombines
+
+ call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+ call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef) ; Negative stride
+ call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+ call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+ ret void
+}
define void @reduce_add() {
; CHECK-LABEL: 'reduce_add'
>From 60c635fe508c47568e39268476fcc39d662a5029 Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 14:01:34 -0800
Subject: [PATCH 2/7] [Analysis] Add StrideVal to
TargetTransformInfo::MemIntrinsicCostAttributes
For strided memory accesses with constant strides, this will allow us to
properly cost cases where consecutive elements fall on the same
cache line.
---
.../llvm/Analysis/TargetTransformInfo.h | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8b06b4aae26ce..be8171466f622 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -149,25 +149,30 @@ class MemIntrinsicCostAttributes {
/// Alignment of single element.
Align Alignment;
+ const Value *StrideVal;
+
public:
LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
- const Instruction *I = nullptr)
+ const Instruction *I = nullptr,
+ const Value *StrideVal = nullptr)
: I(I), Ptr(Ptr), DataTy(DataTy), IID(Id), VariableMask(VariableMask),
- Alignment(Alignment) {}
+ Alignment(Alignment), StrideVal(StrideVal) {}
LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
Align Alignment,
- unsigned AddressSpace = 0)
+ unsigned AddressSpace = 0,
+ const Value *StrideVal = nullptr)
: DataTy(DataTy), IID(Id), AddressSpace(AddressSpace),
- Alignment(Alignment) {}
+ Alignment(Alignment), StrideVal(StrideVal) {}
LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy,
bool VariableMask, Align Alignment,
- const Instruction *I = nullptr)
+ const Instruction *I = nullptr,
+ const Value *StrideVal = nullptr)
: I(I), DataTy(DataTy), IID(Id), VariableMask(VariableMask),
- Alignment(Alignment) {}
+ Alignment(Alignment), StrideVal(StrideVal) {}
Intrinsic::ID getID() const { return IID; }
const Instruction *getInst() const { return I; }
@@ -176,6 +181,7 @@ class MemIntrinsicCostAttributes {
bool getVariableMask() const { return VariableMask; }
unsigned getAddressSpace() const { return AddressSpace; }
Align getAlignment() const { return Alignment; }
+ const Value *getStrideVal() const { return StrideVal; }
};
class IntrinsicCostAttributes {
>From 9140d7656a0171cfe968b79c221781965b0737dd Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:03 -0800
Subject: [PATCH 3/7] [Analysis] Pass along the stride when calculating strided
load costs in BasicTTIImpl
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ef91c845ce9e7..6d67a7feb703c 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1948,6 +1948,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::experimental_vp_strided_store: {
const Value *Data = Args[0];
const Value *Ptr = Args[1];
+ const Value *Stride = Args[2];
const Value *Mask = Args[3];
const Value *EVL = Args[4];
bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
@@ -1956,11 +1957,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
return thisT()->getMemIntrinsicInstrCost(
MemIntrinsicCostAttributes(IID, Data->getType(), Ptr, VarMask,
- Alignment, I),
+ Alignment, I, Stride),
CostKind);
}
case Intrinsic::experimental_vp_strided_load: {
const Value *Ptr = Args[0];
+ const Value *Stride = Args[1];
const Value *Mask = Args[2];
const Value *EVL = Args[3];
bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
@@ -1968,7 +1970,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
Align Alignment =
I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
return thisT()->getMemIntrinsicInstrCost(
- MemIntrinsicCostAttributes(IID, RetTy, Ptr, VarMask, Alignment, I),
+ MemIntrinsicCostAttributes(IID, RetTy, Ptr, VarMask, Alignment, I,
+ Stride),
CostKind);
}
case Intrinsic::stepvector: {
>From 27f4e62c99fd0e281c9b3c20daa2fe81ad51ecef Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:18 -0800
Subject: [PATCH 4/7] [RISCV] Add MaxVectorCoalesceElts to RISCVTuneInfo
Controls the maximum number of elements that can be
coalesced on a vector load if allowed by the access
pattern and cache line size.
---
llvm/lib/Target/RISCV/RISCVProcessors.td | 4 +++-
llvm/lib/Target/RISCV/RISCVSubtarget.h | 8 ++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 5becfd2ad502b..c8242c34eba8e 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -43,6 +43,8 @@ class RISCVTuneInfo {
bits<32> MaxLoadsPerMemcmpOptSize = 4;
bits<32> MaxLoadsPerMemcmp = 8;
+ bits<8> MaxVectorCoalesceElts = 2;
+
// The direction of PostRA scheduling.
code PostRASchedDirection = TopDown;
}
@@ -58,7 +60,7 @@ def RISCVTuneInfoTable : GenericTable {
"MaxStoresPerMemcpyOptSize", "MaxStoresPerMemcpy",
"MaxStoresPerMemmoveOptSize", "MaxStoresPerMemmove",
"MaxLoadsPerMemcmpOptSize", "MaxLoadsPerMemcmp",
- "PostRASchedDirection"];
+ "MaxVectorCoalesceElts", "PostRASchedDirection"];
}
def getRISCVTuneInfo : SearchIndex {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index b2e0abbdd6e64..fe7f16968ada9 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -68,6 +68,10 @@ struct RISCVTuneInfo {
unsigned MaxLoadsPerMemcmpOptSize;
unsigned MaxLoadsPerMemcmp;
+ // How many vector elements can be coalesced if on the
+ // same cache line
+ uint8_t MaxVectorCoalesceElts;
+
// The direction of PostRA scheduling.
MISched::Direction PostRASchedDirection;
};
@@ -432,6 +436,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
: TuneInfo->MaxLoadsPerMemcmp;
}
+ uint8_t getMaxVectorCoalesceElts() const {
+ return TuneInfo->MaxVectorCoalesceElts;
+ }
+
MISched::Direction getPostRASchedDirection() const {
return TuneInfo->PostRASchedDirection;
}
>From 869a4cfcb39514ba03946731661e31e385f2138b Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Tue, 6 Jan 2026 15:35:34 -0800
Subject: [PATCH 5/7] [RISCV] Consider the stride if known in
RISCVTTIImpl::getStridedMemoryOpCost()
When the stride is small and constant, we can predict the speedup
due to memory coalescing.
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 19 +++++++++++
.../Analysis/CostModel/RISCV/vp-intrinsics.ll | 34 +++++++++++++------
2 files changed, 42 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e812d092c3ea0..300371146c759 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1274,6 +1274,25 @@ RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
{TTI::OK_AnyValue, TTI::OP_None}, I);
unsigned NumLoads = getEstimatedVLFor(&VTy);
+ // Performant implementations of the vector extension will attempt to re-use
+ // elements if they fall on the same cache line
+ uint64_t CacheLineBytes = ST->getCacheLineSize();
+ if (!CacheLineBytes) // If no value, use default value of 64
+ CacheLineBytes = 64;
+ unsigned EltsPerCL = (CacheLineBytes * 8) / DataTy->getScalarSizeInBits();
+ if (const ConstantInt *StrideCI =
+ dyn_cast_or_null<ConstantInt>(MICA.getStrideVal())) {
+ uint64_t AbsStride = (uint64_t)std::abs(StrideCI->getSExtValue());
+ if (AbsStride < EltsPerCL) {
+ uint64_t MaxCombines = ST->getMaxVectorCoalesceElts();
+ if ((EltsPerCL / AbsStride) >= MaxCombines)
+ NumLoads = divideCeil(NumLoads, MaxCombines);
+ else
+ // If we were to calculate EltsPerCL / AbsStride first, would lose
+ // accuracy
+ NumLoads = divideCeil((NumLoads * AbsStride), EltsPerCL);
+ }
+ }
return NumLoads * MemOpCost;
}
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index fee3953ebdc95..94572f0272bfd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1224,17 +1224,29 @@ define void @strided_store() {
}
define void @constant_strided_load() {
-; CHECK-LABEL: 'constant_strided_load'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ARGBASED-LABEL: 'constant_strided_load'
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPEBASED-LABEL: 'constant_strided_load'
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
%t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
>From 574199b09b459504e2a18095ede4ae6971a18e91 Mon Sep 17 00:00:00 2001
From: bababuck <rbuchner at qti.qualcomm.com>
Date: Thu, 15 Jan 2026 11:54:37 -0800
Subject: [PATCH 6/7] Use poison rather than undef in test
---
.../Analysis/CostModel/RISCV/vp-intrinsics.ll | 54 +++++++++----------
1 file changed, 27 insertions(+), 27 deletions(-)
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 94572f0272bfd..97a954b2657e5 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1225,39 +1225,39 @@ define void @strided_store() {
define void @constant_strided_load() {
; ARGBASED-LABEL: 'constant_strided_load'
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPEBASED-LABEL: 'constant_strided_load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr undef, i64 1, <2 x i1> undef, i32 undef) ; Make sure cost doesn't go to zero
- %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr undef, i64 1, <4 x i1> undef, i32 undef)
- %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr undef, i64 8, <16 x i1> undef, i32 undef) ; No overlap
- %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr undef, i64 7, <vscale x 4 x i1> undef, i32 undef) ; Barely overlap
- %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr undef, i64 1, <vscale x 16 x i1> undef, i32 undef) ; Should be limited by MaxCombines
+ %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
+ %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
+ %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison) ; No overlap
+ %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
+ %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
- call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> undef, ptr undef, i64 14, <4 x i1> undef, i32 undef)
- call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> undef, ptr undef, i64 -3, <16 x i1> undef, i32 undef) ; Negative stride
- call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> undef, ptr undef, i64 4, <vscale x 4 x i1> undef, i32 undef)
- call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> undef, ptr undef, i64 5, <vscale x 16 x i1> undef, i32 undef)
+ call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
+ call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison) ; Negative stride
+ call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
+ call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
ret void
}
>From f863a971284d8a4b336f714eeaa36e047c3ea425 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Mon, 2 Mar 2026 23:43:47 -0800
Subject: [PATCH 7/7] [RISCV] Adjust for the fact that stride is in bytes, not
elements
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 9 ++--
.../Analysis/CostModel/RISCV/vp-intrinsics.ll | 48 +++++++++----------
2 files changed, 28 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 300371146c759..84ba004e03d04 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1279,18 +1279,17 @@ RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
uint64_t CacheLineBytes = ST->getCacheLineSize();
if (!CacheLineBytes) // If no value, use default value of 64
CacheLineBytes = 64;
- unsigned EltsPerCL = (CacheLineBytes * 8) / DataTy->getScalarSizeInBits();
if (const ConstantInt *StrideCI =
dyn_cast_or_null<ConstantInt>(MICA.getStrideVal())) {
uint64_t AbsStride = (uint64_t)std::abs(StrideCI->getSExtValue());
- if (AbsStride < EltsPerCL) {
+ if (AbsStride < CacheLineBytes) {
uint64_t MaxCombines = ST->getMaxVectorCoalesceElts();
- if ((EltsPerCL / AbsStride) >= MaxCombines)
+ if ((CacheLineBytes / AbsStride) >= MaxCombines)
NumLoads = divideCeil(NumLoads, MaxCombines);
else
- // If we were to calculate EltsPerCL / AbsStride first, would lose
+ // If we were to calculate CacheLineBytes / AbsStride first, would lose
// accuracy
- NumLoads = divideCeil((NumLoads * AbsStride), EltsPerCL);
+ NumLoads = divideCeil((NumLoads * AbsStride), CacheLineBytes);
}
}
return NumLoads * MemOpCost;
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 97a954b2657e5..0754d2fa0f151 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1225,39 +1225,39 @@ define void @strided_store() {
define void @constant_strided_load() {
; ARGBASED-LABEL: 'constant_strided_load'
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison)
; ARGBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
-; ARGBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+; ARGBASED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
; ARGBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPEBASED-LABEL: 'constant_strided_load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 1, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
- %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 1, <4 x i1> poison, i32 poison)
- %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 8, <16 x i1> poison, i32 poison) ; No overlap
- %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 7, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
- %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 1, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
+ %t9 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.i64(ptr poison, i64 8, <2 x i1> poison, i32 poison) ; Make sure cost doesn't go to zero
+ %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.i64(ptr poison, i64 8, <4 x i1> poison, i32 poison)
+ %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.i64(ptr poison, i64 64, <16 x i1> poison, i32 poison) ; No overlap
+ %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.i64(ptr poison, i64 56, <vscale x 4 x i1> poison, i32 poison) ; Barely overlap
+ %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.i64(ptr poison, i64 8, <vscale x 16 x i1> poison, i32 poison) ; Should be limited by MaxCombines
call void @llvm.experimental.vp.strided.store.v4i8.i64(<4 x i8> poison, ptr poison, i64 14, <4 x i1> poison, i32 poison)
- call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -3, <16 x i1> poison, i32 poison) ; Negative stride
- call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 4, <vscale x 4 x i1> poison, i32 poison)
- call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 5, <vscale x 16 x i1> poison, i32 poison)
+ call void @llvm.experimental.vp.strided.store.v16i64.i64(<16 x i64> poison, ptr poison, i64 -24, <16 x i1> poison, i32 poison) ; Negative stride
+ call void @llvm.experimental.vp.strided.store.nxv4i64.i64(<vscale x 4 x i64> poison, ptr poison, i64 32, <vscale x 4 x i1> poison, i32 poison)
+ call void @llvm.experimental.vp.strided.store.nxv16i64.i64(<vscale x 16 x i64> poison, ptr poison, i64 40, <vscale x 16 x i1> poison, i32 poison)
ret void
}
More information about the llvm-commits
mailing list