[llvm] [AArch64] Increase scatter overhead on Neoverse-V2 (PR #101296)
Madhur Amilkanthwar via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 7 08:31:17 PDT 2024
https://github.com/madhur13490 updated https://github.com/llvm/llvm-project/pull/101296
>From 5bc19b3d59dded4756d2cd18d22c6cd727217b37 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Thu, 25 Jul 2024 18:30:21 +0530
Subject: [PATCH 1/3] [AArch64] Increase scatter overhead on Neoverse-V2
This patch increases the scatter overhead on Neoverse-V2 to 13.
This benefits the s128 kernel from the TSVC_2 test suite.
SPEC 2017, RAJAPerf, and Spatter are unaffected by this patch.
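For context, getGatherScatterOpCost in this file computes the SVE
gather/scatter cost as roughly

  Cost = LT.first * (MemOpCost * Overhead) * getMaxNumElements(LegalVF)

so, assuming a legal type (LT.first == 1) and a base MemOpCost of 1, a
<vscale x 4 x float> scatter on Neoverse-V2 comes out as
1 * (1 * 13) * 4 = 52, which is the value checked by the new test below.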
---
.../AArch64/AArch64TargetTransformInfo.cpp | 22 +++++++++++++++++--
.../LoopVectorize/AArch64/scatter-cost.ll | 8 +++++++
2 files changed, 28 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 79c0e45e3aa5b..d8c6416849cc5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3389,8 +3389,26 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode, AArch64Subtarget::ARMProcFamilyEnum ProcFamily) {
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Should be called on only load or stores.");
return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
+ unsigned Cost = 1;
+ switch(Opcode) {
+ case Instruction::Load:
+ Cost = SVEGatherOverhead;
+ break;
+ case Instruction::Store:
+ if (ProcFamily == AArch64Subtarget::NeoverseV2) {
+ Cost = 13;
+ } else {
+ Cost = SVEScatterOverhead;
+ }
+ break;
+ default:
+ llvm_unreachable("Shouldn't have reached here");
+ }
+ return Cost;
}
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3424,7 +3442,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
// Add on an overhead cost for using gathers/scatters.
// TODO: At the moment this is applied unilaterally for all CPUs, but at some
// point we may want a per-CPU overhead.
- MemOpCost *= getSVEGatherScatterOverhead(Opcode);
+ MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST->getProcFamily());
return LT.first * MemOpCost * getMaxNumElements(LegalVF);
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
new file mode 100644
index 0000000000000..8bdae00411a1f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -0,0 +1,8 @@
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output | FileCheck %s
+; CHECK: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
+
+define void @masked_scatter_nxv4f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
+ call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+ ret void
+}
+
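To reproduce the cost estimate outside of lit, something along these
lines should work (the exact diagnostic wording can vary between LLVM
revisions, and the cost-model printer reports on stderr, hence the 2>&1
that the updated RUN line below also adds):

  opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" \
      -disable-output scatter-cost.ll 2>&1 | grep masked.scatter

which should report the estimated cost of 52 for the
llvm.masked.scatter.nxv4f32 call.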
>From 2a7128ede5134d8d68c1c64082136cb4b68edc97 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Wed, 7 Aug 2024 19:38:16 +0530
Subject: [PATCH 2/3] Address review comments
---
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 4 +++-
llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 ++
.../AArch64/AArch64TargetTransformInfo.cpp | 17 ++++++-----------
.../LoopVectorize/AArch64/scatter-cost.ll | 2 +-
4 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 642006e706c13..0f90f78dcde29 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -233,9 +233,11 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefLoopAlignment = Align(32);
MaxBytesForLoopAlignment = 16;
break;
+ case NeoverseV2:
+ ScatterOverhead = 13;
+ LLVM_FALLTHROUGH;
case NeoverseN2:
case NeoverseN3:
- case NeoverseV2:
case NeoverseV3:
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 0f3a637f98fbe..3890758086f4b 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -59,6 +59,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 2;
uint16_t CacheLineSize = 0;
+ unsigned ScatterOverhead = 10;
uint16_t PrefetchDistance = 0;
uint16_t MinPrefetchStride = 1;
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
@@ -225,6 +226,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const;
unsigned getCacheLineSize() const override { return CacheLineSize; }
+ unsigned getScatterOverhead() const { return ScatterOverhead; }
unsigned getPrefetchDistance() const override { return PrefetchDistance; }
unsigned getMinPrefetchStride(unsigned NumMemAccesses,
unsigned NumStridedMemAccesses,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8c6416849cc5..e554d865c318f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3389,26 +3389,21 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode, AArch64Subtarget::ARMProcFamilyEnum ProcFamily) {
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget* ST) {
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Should be called on only load or stores.");
- return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
- unsigned Cost = 1;
switch(Opcode) {
case Instruction::Load:
- Cost = SVEGatherOverhead;
+ return SVEGatherOverhead;
break;
case Instruction::Store:
- if (ProcFamily == AArch64Subtarget::NeoverseV2) {
- Cost = 13;
- } else {
- Cost = SVEScatterOverhead;
- }
+ if (SVEScatterOverhead.getNumOccurrences() > 0)
+ return SVEScatterOverhead;
+ return ST->getScatterOverhead();
break;
default:
llvm_unreachable("Shouldn't have reached here");
}
- return Cost;
}
InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3442,7 +3437,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
// Add on an overhead cost for using gathers/scatters.
// TODO: At the moment this is applied unilaterally for all CPUs, but at some
// point we may want a per-CPU overhead.
- MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST->getProcFamily());
+ MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
return LT.first * MemOpCost * getMaxNumElements(LegalVF);
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
index 8bdae00411a1f..a17781105613c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output | FileCheck %s
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
define void @masked_scatter_nxv4f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
>From f58e534a518cced44b9005c3ce4567b0c8464a33 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Wed, 7 Aug 2024 21:01:01 +0530
Subject: [PATCH 3/3] Address more review comments
---
llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 3 ++-
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 4 ++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 0f90f78dcde29..32db1e8c2477a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -234,8 +234,9 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case NeoverseV2:
+ // Specialize cost for Neoverse-V2.
ScatterOverhead = 13;
- LLVM_FALLTHROUGH;
+ LLVM_FALLTHROUGH;
case NeoverseN2:
case NeoverseN3:
case NeoverseV3:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e554d865c318f..6be0e137b2432 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3389,6 +3389,8 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
+// This function returns the gather/scatter overhead either from a
+// user-provided value or from per-subtarget specialized values in \p ST.
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget* ST) {
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Should be called on only load or stores.");
@@ -3435,8 +3437,6 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
{TTI::OK_AnyValue, TTI::OP_None}, I);
// Add on an overhead cost for using gathers/scatters.
- // TODO: At the moment this is applied unilaterally for all CPUs, but at some
- // point we may want a per-CPU overhead.
MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
return LT.first * MemOpCost * getMaxNumElements(LegalVF);
}
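For reference, with all three patches applied the helper ends up reading
roughly as below. This is a sketch reconstructed from the hunks in this
series; the SVEGatherOverhead/SVEScatterOverhead cl::opt definitions
live elsewhere in AArch64TargetTransformInfo.cpp and are assumed here,
not shown, and the dead break statements after the returns are omitted:

  // This function returns the gather/scatter overhead either from a
  // user-provided value or from per-subtarget specialized values in \p ST.
  static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
                                              const AArch64Subtarget *ST) {
    assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
           "Should only be called on loads or stores.");
    switch (Opcode) {
    case Instruction::Load:
      return SVEGatherOverhead;
    case Instruction::Store:
      // A scatter overhead given explicitly on the command line takes
      // precedence over the per-subtarget default (13 on Neoverse-V2,
      // 10 otherwise).
      if (SVEScatterOverhead.getNumOccurrences() > 0)
        return SVEScatterOverhead;
      return ST->getScatterOverhead();
    default:
      llvm_unreachable("Shouldn't have reached here");
    }
  }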