[llvm] [AArch64] Add cost model for @experimental.vector.match (PR #118512)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 10 08:26:40 PST 2024
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/118512
>From 955435efdfc917a7a9a75dcebaeaf4bc384e1055 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 3 Dec 2024 06:53:42 -0800
Subject: [PATCH 1/4] [AArch64] Add cost model for @experimental.vector.match
Currently, vector types that would default to generic lowering (i.e.
that would not utilise a MATCH instruction) return an invalid cost.
Fixed-length search vectors have a higher cost than scalable vectors
because we need a few more instructions to convert the boolean mask.
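For reference, a minimal IR example of the intrinsic being costed (operand
names are illustrative; the nxv16i8/v16i8 form is one of those that can map
to SVE2 MATCH):

  declare <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, <vscale x 16 x i1>)

  define <vscale x 16 x i1> @match_example(<vscale x 16 x i8> %haystack, <16 x i8> %needle, <vscale x 16 x i1> %mask) {
    ; Returns a predicate of the active lanes of %haystack whose value
    ; appears anywhere in %needle.
    %r = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> %haystack, <16 x i8> %needle, <vscale x 16 x i1> %mask)
    ret <vscale x 16 x i1> %r
  }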
---
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 17 +++++++
.../CostModel/AArch64/sve-intrinsics.ll | 48 +++++++++++++++++++
3 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 48ebffff8cbfc2..306e305068e2d9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -781,7 +781,9 @@ class TargetTransformInfoImplBase {
default:
break;
case Intrinsic::experimental_vector_histogram_add:
- // For now, we want explicit support from the target for histograms.
+ case Intrinsic::experimental_vector_match:
+ // For now, we want explicit support from the target for histograms and
+ // matches.
return InstructionCost::getInvalid();
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd52..8c8bb6b21ebdde 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -905,6 +905,23 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::experimental_vector_match: {
+ EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ unsigned SearchSize =
+ cast<FixedVectorType>(ICA.getArgTypes()[1])->getNumElements();
+ // If we can't lower to MATCH, return an invalid cost.
+ if (getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
+ return InstructionCost::getInvalid();
+ // Base cost for MATCH instructions. At least on the Neoverse V2 and
+ // Neoverse V3 these are cheap operations with the same latency as a vector
+ // ADD. In most cases, however, we also need to do an extra DUP.
+ InstructionCost Cost = 4;
+ // For fixed-length vectors we currently need an extra five--six
+ // instructions besides the MATCH.
+ if (isa<FixedVectorType>(RetTy))
+ Cost += 10;
+ return Cost;
+ }
default:
break;
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index cca4ea73ee6628..9ba8fcb42a5298 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1360,6 +1360,54 @@ define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %m
ret void
}
+define void @match() #3 {
+; CHECK-VSCALE-1-LABEL: 'match'
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-VSCALE-2-LABEL: 'match'
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; TYPE_BASED_ONLY-LABEL: 'match'
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+
+ %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
+ %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
+ %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+ %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+
+ %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
+ %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
+ %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+ %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+
+ ret void
+}
+
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
>From b0c35dee7619f5b9bd4c0dba66083fa43507b941 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 5 Dec 2024 00:38:20 -0800
Subject: [PATCH 2/4] Implement cost calculation for generic lowering
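When the target cannot lower to MATCH, approximate the cost of the generic
expansion emitted by SelectionDAGBuilder: roughly one extract/splat/compare/or
per needle element, plus a final and with the input mask. A minimal IR-level
sketch of that shape for the v2i64 case (illustrative only, not the exact
lowering):

  define <2 x i1> @match_v2i64_sketch(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
    ; Needle element 0: extract, splat, compare against the search vector.
    %e0 = extractelement <2 x i64> %op2, i64 0
    %i0 = insertelement <2 x i64> poison, i64 %e0, i64 0
    %s0 = shufflevector <2 x i64> %i0, <2 x i64> poison, <2 x i32> zeroinitializer
    %c0 = icmp eq <2 x i64> %op1, %s0
    ; Needle element 1: extract, splat, compare against the search vector.
    %e1 = extractelement <2 x i64> %op2, i64 1
    %i1 = insertelement <2 x i64> poison, i64 %e1, i64 0
    %s1 = shufflevector <2 x i64> %i1, <2 x i64> poison, <2 x i32> zeroinitializer
    %c1 = icmp eq <2 x i64> %op1, %s1
    ; Accumulate per-element results and apply the input mask.
    %or = or <2 x i1> %c0, %c1
    %r  = and <2 x i1> %or, %mask
    ret <2 x i1> %r
  }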
---
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +--
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 29 +++++++++++++++++
.../AArch64/AArch64TargetTransformInfo.cpp | 31 ++++++++++---------
.../CostModel/AArch64/sve-intrinsics.ll | 20 ++++++------
4 files changed, 56 insertions(+), 28 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 306e305068e2d9..48ebffff8cbfc2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -781,9 +781,7 @@ class TargetTransformInfoImplBase {
default:
break;
case Intrinsic::experimental_vector_histogram_add:
- case Intrinsic::experimental_vector_match:
- // For now, we want explicit support from the target for histograms and
- // matches.
+ // For now, we want explicit support from the target for histograms.
return InstructionCost::getInvalid();
case Intrinsic::allow_runtime_check:
case Intrinsic::allow_ubsan_check:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f46f07122329e7..03fa3fbac8e970 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1935,6 +1935,35 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
+ case Intrinsic::experimental_vector_match: {
+ auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
+ auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
+ unsigned SearchSize = NeedleTy->getNumElements();
+
+ // If we're not expanding the intrinsic then we assume this is cheap to
+ // implement.
+ EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
+ if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
+ return getTypeLegalizationCost(RetTy).first;
+
+ // Approximate the cost based on the expansion code in
+ // SelectionDAGBuilder.
+ InstructionCost Cost = 0;
+ Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
+ CostKind, 1, nullptr, nullptr);
+ Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
+ CostKind, 0, nullptr, nullptr);
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
+ CostKind, 0, nullptr);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
+ CmpInst::ICMP_EQ, CostKind);
+ Cost +=
+ thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+ Cost *= SearchSize;
+ Cost +=
+ thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+ return Cost;
+ }
}
// Assume that we need to scalarize this intrinsic.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8c8bb6b21ebdde..4389a0e048ff61 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -906,21 +906,22 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
break;
}
case Intrinsic::experimental_vector_match: {
- EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
- unsigned SearchSize =
- cast<FixedVectorType>(ICA.getArgTypes()[1])->getNumElements();
- // If we can't lower to MATCH, return an invalid cost.
- if (getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
- return InstructionCost::getInvalid();
- // Base cost for MATCH instructions. At least on the Neoverse V2 and
- // Neoverse V3 these are cheap operations with the same latency as a vector
- // ADD. In most cases, however, we also need to do an extra DUP.
- InstructionCost Cost = 4;
- // For fixed-length vectors we currently need an extra five--six
- // instructions besides the MATCH.
- if (isa<FixedVectorType>(RetTy))
- Cost += 10;
- return Cost;
+ if (auto *NeedleTy = dyn_cast<FixedVectorType>(ICA.getArgTypes()[1])) {
+ EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ unsigned SearchSize = NeedleTy->getNumElements();
+ if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
+ // Base cost for MATCH instructions. At least on the Neoverse V2 and
+ // Neoverse V3, these are cheap operations with the same latency as a
+ // vector ADD. In most cases, however, we also need to do an extra DUP.
+ // For fixed-length vectors we currently need an extra five--six
+ // instructions besides the MATCH.
+ InstructionCost Cost = 4;
+ if (isa<FixedVectorType>(RetTy))
+ Cost += 10;
+ return Cost;
+ }
+ }
+ break;
}
default:
break;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 9ba8fcb42a5298..61c805993e9d3e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1364,23 +1364,23 @@ define void @match() #3 {
; CHECK-VSCALE-1-LABEL: 'match'
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-VSCALE-2-LABEL: 'match'
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'match'
@@ -1390,8 +1390,8 @@ define void @match() #3 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
>From 15df9dd2428cc498fe59f4ff8d9a9e3c9200d8d5 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 10 Dec 2024 04:14:20 -0800
Subject: [PATCH 3/4] Move base cost calculation to
getTypeBasedIntrinsicInstrCost
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 58 +++++++++----------
.../AArch64/AArch64TargetTransformInfo.cpp | 27 +++++----
.../CostModel/AArch64/sve-intrinsics.ll | 8 +--
3 files changed, 46 insertions(+), 47 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 03fa3fbac8e970..a4dc3b5c44c632 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1935,35 +1935,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
- case Intrinsic::experimental_vector_match: {
- auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
- auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
- unsigned SearchSize = NeedleTy->getNumElements();
-
- // If we're not expanding the intrinsic then we assume this is cheap to
- // implement.
- EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
- if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
- return getTypeLegalizationCost(RetTy).first;
-
- // Approximate the cost based on the expansion code in
- // SelectionDAGBuilder.
- InstructionCost Cost = 0;
- Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
- CostKind, 1, nullptr, nullptr);
- Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
- CostKind, 0, nullptr, nullptr);
- Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
- CostKind, 0, nullptr);
- Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
- CmpInst::ICMP_EQ, CostKind);
- Cost +=
- thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
- Cost *= SearchSize;
- Cost +=
- thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
- return Cost;
- }
}
// Assume that we need to scalarize this intrinsic.
@@ -2219,6 +2190,35 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::vector_reduce_fminimum:
return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
VecOpTy, ICA.getFlags(), CostKind);
+ case Intrinsic::experimental_vector_match: {
+ auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
+ auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
+ unsigned SearchSize = NeedleTy->getNumElements();
+
+ // If we're not expanding the intrinsic then we assume this is cheap to
+ // implement.
+ EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
+ if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
+ return getTypeLegalizationCost(RetTy).first;
+
+ // Approximate the cost based on the expansion code in
+ // SelectionDAGBuilder.
+ InstructionCost Cost = 0;
+ Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
+ CostKind, 1, nullptr, nullptr);
+ Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
+ CostKind, 0, nullptr, nullptr);
+ Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
+ CostKind, 0, nullptr);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
+ CmpInst::ICMP_EQ, CostKind);
+ Cost +=
+ thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+ Cost *= SearchSize;
+ Cost +=
+ thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+ return Cost;
+ }
case Intrinsic::abs:
ISD = ISD::ABS;
break;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4389a0e048ff61..000485986cc996 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -906,20 +906,19 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
break;
}
case Intrinsic::experimental_vector_match: {
- if (auto *NeedleTy = dyn_cast<FixedVectorType>(ICA.getArgTypes()[1])) {
- EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
- unsigned SearchSize = NeedleTy->getNumElements();
- if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
- // Base cost for MATCH instructions. At least on the Neoverse V2 and
- // Neoverse V3, these are cheap operations with the same latency as a
- // vector ADD. In most cases, however, we also need to do an extra DUP.
- // For fixed-length vectors we currently need an extra five--six
- // instructions besides the MATCH.
- InstructionCost Cost = 4;
- if (isa<FixedVectorType>(RetTy))
- Cost += 10;
- return Cost;
- }
+ auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
+ EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ unsigned SearchSize = NeedleTy->getNumElements();
+ if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
+ // Base cost for MATCH instructions. At least on the Neoverse V2 and
+ // Neoverse V3, these are cheap operations with the same latency as a
+ // vector ADD. In most cases, however, we also need to do an extra DUP.
+ // For fixed-length vectors we currently need an extra five--six
+ // instructions besides the MATCH.
+ InstructionCost Cost = 4;
+ if (isa<FixedVectorType>(RetTy))
+ Cost += 10;
+ return Cost;
}
break;
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 61c805993e9d3e..dd3909ade53159 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -1386,12 +1386,12 @@ define void @match() #3 {
; TYPE_BASED_ONLY-LABEL: 'match'
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
>From 3353d5e02efc96c8bc835324421a097d412ed1c9 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Tue, 10 Dec 2024 08:21:22 -0800
Subject: [PATCH 4/4] Add shortcut to call onto getTypeBasedIntrinsicInstrCost
directly
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index a4dc3b5c44c632..8eef8ea665a26f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1935,6 +1935,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
+ case Intrinsic::experimental_vector_match:
+ return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
}
// Assume that we need to scalarize this intrinsic.