[llvm] [Analysis][AArch64] Add cost model for loop.dependence.{war/raw}.mask (PR #167551)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 03:16:17 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/167551
>From 4cd8518ccbd25453e2db55e17db504104819afe7 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 11 Nov 2025 16:25:57 +0000
Subject: [PATCH 1/3] [Analysis][AArch64] Add cost model for
loop.dependence.{war/raw}.mask
This PR adds the cost model for the loop dependence mask intrinsics,
covering both the case where they must be expanded and the case where
they can be lowered directly on AArch64.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 ++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 34 ++++++
.../CostModel/AArch64/loop_dependence_mask.ll | 104 ++++++++++++++++++
3 files changed, 185 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 129a1971981d5..5439dc26dadb0 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2139,6 +2139,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Otherwise, fallback to default scalarization cost.
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask: {
+ InstructionCost Cost = 0;
+ Type *PtrTy = ICA.getArgTypes()[0];
+ bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
+ if (IsReadAfterWrite) {
+ IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
+ Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
+ }
+
+ Cost +=
+ thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
+ Type *CmpTy =
+ getTLI()
+ ->getSetCCResultType(
+ thisT()->getDataLayout(), RetTy->getContext(),
+ getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
+ .getTypeForEVT(RetTy->getContext());
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, CmpTy, PtrTy,
+ IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
+
+ // The deconstructed active lane mask
+ VectorType *RetTyVec = cast<VectorType>(RetTy);
+ VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
+ CostKind, 0, nullptr);
+ IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
+ FMF);
+ Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
+ SplatTy, CmpInst::ICMP_ULT, CostKind);
+
+ Cost +=
+ thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
+ RetTyVec->getElementType(), CmpTy,
+ TTI::CastContextHint::None, CostKind);
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
+ CostKind, 0, nullptr);
+ Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
+ return Cost;
+ }
}
// Assume that we need to scalarize this intrinsic.)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..3fd25620b3cc9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1064,6 +1064,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::loop_dependence_raw_mask:
+ case Intrinsic::loop_dependence_war_mask: {
+ auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
+ EVT VecVT = getTLI()->getValueType(DL, RetTy);
+ // An invalid element size and return type combination must be expanded.
+ bool MustBeExpanded = false;
+ switch (EltSize->getSExtValue()) {
+ case 1:
+ if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
+ MustBeExpanded = true;
+ break;
+ case 2:
+ if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
+ MustBeExpanded = true;
+ break;
+ case 4:
+ if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
+ MustBeExpanded = true;
+ break;
+ case 8:
+ if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
+ MustBeExpanded = true;
+ break;
+ default:
+ MustBeExpanded = true;
+ // Other element sizes are incompatible with whilewr/rw, so expand instead
+ break;
+ }
+
+ // The whilewr/rw instructions require SVE2 or SME
+ if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
+ break;
+ return 1;
+ }
case Intrinsic::experimental_vector_extract_last_active:
if (ST->isSVEorStreamingSVEAvailable()) {
auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
new file mode 100644
index 0000000000000..1074d41d994d9
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK
+
+; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled.
+define void @loop_dependence_war_mask(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_war_mask'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
+
+define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_raw_mask'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+ %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+ %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+ %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+ ret void
+}
+
+; Invalid element size and return type combinations must be expanded, even with sve2/sme
+define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+ %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+ %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+ %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+ %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+ ret void
+}
+
+define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) {
+; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+ %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+ %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+ %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+ %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+ ret void
+}
>From c6c5fb0556a264ded301c4f779f97299b9e2951f Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 13 Nov 2025 15:46:05 +0000
Subject: [PATCH 2/3] Address review
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 +++++++++
.../SelectionDAG/LegalizeVectorOps.cpp | 16 +++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 35 +++++--------------
3 files changed, 40 insertions(+), 27 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5439dc26dadb0..1a830a4c5f064 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2141,6 +2141,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask: {
+ // Compute the cost of the expanded version of these intrinsics:
+ // ; Figure out if there's overlap between the pointers.
+ // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the
+ // absolute difference
+ // cmp = diff <= 0 ; read-after-write will check for equality
+ // with 0
+ // ; Create a mask with each lane < diff active. This is essentiallly
+ // an active lane mask between 0 and diff.
+ // diff_splat = splat diff to <Y x i64>
+ // steps = stepvector <Y x i64>
+ // diff_mask = steps <= diff_splat
+ // ; OR that diff mask with the comparison result, so that each lane is
+ // active if it's less than diff or there was no overlap in the
+ // first place. Otherwise the lane is inactive.
+ // cmp_splat = splat cmp to <Y x i1>
+ // result = or cmp_splat diff_mask
InstructionCost Cost = 0;
Type *PtrTy = ICA.getArgTypes()[0];
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index e8d9bce43f6ea..b37d15b15ae67 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1811,6 +1811,22 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
}
SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) {
+ // Expand these intrinsics:
+ // ; Figure out if there's overlap between the pointers.
+ // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the absolute
+ // difference
+ // cmp = diff <= 0 ; read-after-write will check for equality
+ // with 0
+ // ; Create a mask with each lane < diff active. This is essentially an
+ // active lane mask between 0 and diff.
+ // diff_splat = splat diff to <Y x i64>
+ // steps = stepvector <Y x i64>
+ // diff_mask = steps < diff_splat
+ // ; OR that diff mask with the comparison result, so that each lane is
+ // active if it's less than diff or there was no overlap in the
+ // first place. Otherwise the lane is inactive.
+ // cmp_splat = splat cmp to <Y x i1>
+ // result = or cmp_splat diff_mask
SDLoc DL(N);
SDValue SourceValue = N->getOperand(0);
SDValue SinkValue = N->getOperand(1);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3fd25620b3cc9..5b5db7e6a2ee8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1066,37 +1066,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask: {
- auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
+ unsigned EltSizeInBytes =
+ cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
EVT VecVT = getTLI()->getValueType(DL, RetTy);
// An invalid element size and return type combination must be expanded.
- bool MustBeExpanded = false;
- switch (EltSize->getSExtValue()) {
- case 1:
- if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
- MustBeExpanded = true;
- break;
- case 2:
- if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
- MustBeExpanded = true;
- break;
- case 4:
- if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
- MustBeExpanded = true;
- break;
- case 8:
- if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
- MustBeExpanded = true;
- break;
- default:
- MustBeExpanded = true;
- // Other element sizes are incompatible with whilewr/rw, so expand instead
- break;
- }
+ bool MustBeExpanded =
+ VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes) ||
+ !isPowerOf2_32(EltSizeInBytes) || EltSizeInBytes > 8;
// The whilewr/rw instructions require SVE2 or SME
- if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
- break;
- return 1;
+ if (!MustBeExpanded && (ST->hasSVE2() || ST->hasSME()))
+ return 1;
+ break;
}
case Intrinsic::experimental_vector_extract_last_active:
if (ST->isSVEorStreamingSVEAvailable()) {
>From a1219362ca0ec9aa8b48d6fb67db8cc01002dc7b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 11 Dec 2025 11:14:00 +0000
Subject: [PATCH 3/3] Update cost model
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 80 +++++++-----------
.../AArch64/AArch64TargetTransformInfo.cpp | 20 ++---
.../CostModel/AArch64/loop_dependence_mask.ll | 84 +++++++++++--------
3 files changed, 91 insertions(+), 93 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1a830a4c5f064..de64d9bbb7125 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2139,26 +2139,28 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Otherwise, fallback to default scalarization cost.
break;
}
- case Intrinsic::loop_dependence_raw_mask:
- case Intrinsic::loop_dependence_war_mask: {
+ case Intrinsic::loop_dependence_war_mask:
+ case Intrinsic::loop_dependence_raw_mask: {
// Compute the cost of the expanded version of these intrinsics:
- // ; Figure out if there's overlap between the pointers.
- // diff = (ptrB - ptrA) / eltSize ; read-after-write will use the
- // absolute difference
- // cmp = diff <= 0 ; read-after-write will check for equality
- // with 0
- // ; Create a mask with each lane < diff active. This is essentiallly
- // an active lane mask between 0 and diff.
- // diff_splat = splat diff to <Y x i64>
- // steps = stepvector <Y x i64>
- // diff_mask = steps <= diff_splat
- // ; OR that diff mask with the comparison result, so that each lane is
- // active if it's less than diff or there was no overlap in the
- // first place. Otherwise the lane is inactive.
- // cmp_splat = splat cmp to <Y x i1>
- // result = or cmp_splat diff_mask
+ //
+ // The possible expansions are...
+ //
+ // loop_dependence_war_mask:
+ // diff = (ptrB - ptrA) / eltSize
+ // cmp = icmp sle diff, 0
+ // upper_bound = select cmp, -1, diff
+ // mask = get_active_lane_mask lane_offset, upper_bound
+ //
+ // loop_dependence_raw_mask:
+ // diff = (abs(ptrB - ptrA)) / eltSize
+ // cmp = icmp eq diff, 0
+ // upper_bound = select cmp, -1, diff
+ // mask = get_active_lane_mask lane_offset, upper_bound
+ //
InstructionCost Cost = 0;
Type *PtrTy = ICA.getArgTypes()[0];
+ unsigned EltSizeInBytes =
+ cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;
Cost +=
@@ -2167,39 +2169,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
}
+ Cost += thisT()->getArithmeticInstrCost(
+ isPowerOf2_32(EltSizeInBytes) ? Instruction::AShr : Instruction::SDiv,
+ PtrTy, CostKind);
- Cost +=
- thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
- Type *CmpTy =
- getTLI()
- ->getSetCCResultType(
- thisT()->getDataLayout(), RetTy->getContext(),
- getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
- .getTypeForEVT(RetTy->getContext());
- Cost += thisT()->getCmpSelInstrCost(
- BinaryOperator::ICmp, CmpTy, PtrTy,
- IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);
-
- // The deconstructed active lane mask
- VectorType *RetTyVec = cast<VectorType>(RetTy);
- VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
- Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
- CostKind, 0, nullptr);
- IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
- FMF);
- Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
- Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
- SplatTy, CmpInst::ICMP_ULT, CostKind);
+ Type *CondTy = IntegerType::getInt1Ty(RetTy->getContext());
+ CmpInst::Predicate Pred =
+ IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE;
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CondTy, PtrTy,
+ Pred, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, PtrTy, CondTy,
+ Pred, CostKind);
- Cost +=
- thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
- TTI::CastContextHint::None, CostKind);
- Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
- RetTyVec->getElementType(), CmpTy,
- TTI::CastContextHint::None, CostKind);
- Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
- CostKind, 0, nullptr);
- Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
+ IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
+ {PtrTy, PtrTy}, FMF);
+ Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
return Cost;
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5db7e6a2ee8..b8d408dc6623b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1066,17 +1066,15 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask: {
- unsigned EltSizeInBytes =
- cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
- EVT VecVT = getTLI()->getValueType(DL, RetTy);
- // An invalid element size and return type combination must be expanded.
- bool MustBeExpanded =
- VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes) ||
- !isPowerOf2_32(EltSizeInBytes) || EltSizeInBytes > 8;
-
- // The whilewr/rw instructions require SVE2 or SME
- if (!MustBeExpanded && (ST->hasSVE2() || ST->hasSME()))
- return 1;
+ // The whilewr/rw instructions require SVE2 or SME.
+ if (ST->hasSVE2() || ST->hasSME()) {
+ EVT VecVT = getTLI()->getValueType(DL, RetTy);
+ unsigned EltSizeInBytes =
+ cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
+ if (is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) &&
+ VecVT.getVectorMinNumElements() == (16 / EltSizeInBytes))
+ return 1;
+ }
break;
}
case Intrinsic::experimental_vector_extract_last_active:
diff --git a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
index 1074d41d994d9..1bd698224068f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/loop_dependence_mask.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s --check-prefix=CHECK-EXPANDED
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefix=CHECK
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sme | FileCheck %s --check-prefixes=CHECK,CHECK-SME
; loop.dependence.{war,raw}.mask can be lowered to while{wr,rw} if SVE2 or SME is enabled.
define void @loop_dependence_war_mask(ptr %a, ptr %b) {
; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask'
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8)
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-LABEL: 'loop_dependence_war_mask'
@@ -29,10 +29,10 @@ entry:
define void @loop_dependence_raw_mask(ptr %a, ptr %b) {
; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask'
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res1 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res2 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res3 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res4 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8)
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-LABEL: 'loop_dependence_raw_mask'
@@ -53,20 +53,28 @@ entry:
; Invalid element size and return type combinations must be expanded, even with sve2/sme
define void @loop_dependence_war_mask_invalid(ptr %a, ptr %b) {
; CHECK-EXPANDED-LABEL: 'loop_dependence_war_mask_invalid'
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; CHECK-LABEL: 'loop_dependence_war_mask_invalid'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-SVE2-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-SME-LABEL: 'loop_dependence_war_mask_invalid'
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
%res5 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8)
@@ -79,20 +87,28 @@ entry:
define void @loop_dependence_raw_mask_invalid(ptr %a, ptr %b) {
; CHECK-EXPANDED-LABEL: 'loop_dependence_raw_mask_invalid'
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
-; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
; CHECK-EXPANDED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
-; CHECK-LABEL: 'loop_dependence_raw_mask_invalid'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-SVE2-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-SME-LABEL: 'loop_dependence_raw_mask_invalid'
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %res6 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 4)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res7 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 2)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res8 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 1)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res9 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 10)
+; CHECK-SME-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
entry:
%res5 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 8)
More information about the llvm-commits
mailing list