[llvm] [CostModel] Add type-based cost model for get.active.lane.mask intrinsic (PR #130132)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 7 06:40:46 PST 2025
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/130132
From fc515fbb2b74f427fb4aa0cd7f1f9ada10ef2b4d Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 6 Mar 2025 16:18:06 +0000
Subject: [PATCH 1/3] [CostModel] Add type-based cost model for get.active.lane.mask intrinsic
I recently realised that we return an invalid cost when requesting
the type-based cost for the get.active.lane.mask intrinsic. I've
fixed that in this patch by reusing the existing code for the
non-type-based model.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 +++++++++++--------
.../CostModel/AArch64/sve-intrinsics.ll | 24 +++++-----
2 files changed, 40 insertions(+), 31 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 563953516a354..a663c350a8655 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1691,6 +1691,29 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
+ InstructionCost getActiveLaneMaskCost(Type *RetTy, Type *ArgTy,
+ TTI::TargetCostKind CostKind) {
+ EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
+ EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true);
+
+ // If we're not expanding the intrinsic then we assume this is cheap
+ // to implement.
+ if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT))
+ return getTypeLegalizationCost(RetTy).first;
+
+ // Create the expanded types that will be used to calculate the uadd_sat
+ // operation.
+ Type *ExpRetTy =
+ VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
+ IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {},
+ FastMathFlags());
+ InstructionCost Cost =
+ thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
+ CmpInst::ICMP_ULT, CostKind);
+ return Cost;
+ }
+
/// Get intrinsic cost based on arguments.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
@@ -1987,25 +2010,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
case Intrinsic::get_active_lane_mask: {
- EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
- EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
-
- // If we're not expanding the intrinsic then we assume this is cheap
- // to implement.
- if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
- return getTypeLegalizationCost(RetTy).first;
- }
-
- // Create the expanded types that will be used to calculate the uadd_sat
- // operation.
- Type *ExpRetTy = VectorType::get(
- ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
- IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
- InstructionCost Cost =
- thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
- Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
- CmpInst::ICMP_ULT, CostKind);
- return Cost;
+ return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0],
+ CostKind);
}
case Intrinsic::experimental_cttz_elts: {
EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
@@ -2394,6 +2400,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
return Cost;
}
+ case Intrinsic::get_active_lane_mask:
+ return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0],
+ CostKind);
case Intrinsic::abs:
ISD = ISD::ABS;
break;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 0bf776b5c97e3..265ff89b36050 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -958,16 +958,16 @@ define void @get_lane_mask() #0 {
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'get_lane_mask'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
@@ -976,8 +976,8 @@ define void @get_lane_mask() #0 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
From 3097f5491ad81c9d6f43990f67796173ea3f6b5a Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 7 Mar 2025 10:20:00 +0000
Subject: [PATCH 2/3] Address review comment
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 53 ++++++++++--------------
1 file changed, 23 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index a663c350a8655..3e322e95628a5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1691,29 +1691,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
- InstructionCost getActiveLaneMaskCost(Type *RetTy, Type *ArgTy,
- TTI::TargetCostKind CostKind) {
- EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
- EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true);
-
- // If we're not expanding the intrinsic then we assume this is cheap
- // to implement.
- if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT))
- return getTypeLegalizationCost(RetTy).first;
-
- // Create the expanded types that will be used to calculate the uadd_sat
- // operation.
- Type *ExpRetTy =
- VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
- IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {},
- FastMathFlags());
- InstructionCost Cost =
- thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
- Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
- CmpInst::ICMP_ULT, CostKind);
- return Cost;
- }
-
/// Get intrinsic cost based on arguments.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
@@ -2009,10 +1986,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
}
return Cost;
}
- case Intrinsic::get_active_lane_mask: {
- return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0],
- CostKind);
- }
case Intrinsic::experimental_cttz_elts: {
EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
@@ -2060,6 +2033,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
+ case Intrinsic::get_active_lane_mask:
case Intrinsic::experimental_vector_match:
return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
case Intrinsic::modf:
@@ -2400,9 +2374,28 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
return Cost;
}
- case Intrinsic::get_active_lane_mask:
- return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0],
- CostKind);
+ case Intrinsic::get_active_lane_mask: {
+ Type *ArgTy = ICA.getArgTypes()[0];
+ EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
+ EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true);
+
+ // If we're not expanding the intrinsic then we assume this is cheap
+ // to implement.
+ if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT))
+ return getTypeLegalizationCost(RetTy).first;
+
+ // Create the expanded types that will be used to calculate the uadd_sat
+ // operation.
+ Type *ExpRetTy =
+ VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
+ IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {},
+ FastMathFlags());
+ InstructionCost Cost =
+ thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
+ CmpInst::ICMP_ULT, CostKind);
+ return Cost;
+ }
case Intrinsic::abs:
ISD = ISD::ABS;
break;
From 7f0015fffd7b29512706f4a00ba1663a4fc69092 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Fri, 7 Mar 2025 14:40:05 +0000
Subject: [PATCH 3/3] Address review comment
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3e322e95628a5..bde19de0c87f5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2388,8 +2388,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// operation.
Type *ExpRetTy =
VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount());
- IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {},
- FastMathFlags());
+ IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
InstructionCost Cost =
thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
More information about the llvm-commits mailing list