[llvm] [AArch64] Increase scatter overhead on Neoverse-V2 (PR #101296)

Madhur Amilkanthwar via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 13 08:39:55 PDT 2024


https://github.com/madhur13490 updated https://github.com/llvm/llvm-project/pull/101296

>From 862f1e0bbfd6f62ab1b3965e177ab83b2bcfca5b Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Thu, 25 Jul 2024 18:30:21 +0530
Subject: [PATCH 1/5] [AArch64] Increase scatter overhead on Neoverse-V2

This patch increases the scatter overhead on Neoverse-V2 to 13.
This benefits the s128 kernel from the TSVC_2 test suite; the arithmetic
behind the resulting cost is sketched after the diff below.
SPEC 2017, RAJAPerf, and Spatter are unaffected by this patch.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 22 +++++++++++++++++--
 .../LoopVectorize/AArch64/scatter-cost.ll     |  8 +++++++
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e85fd73996dd1a..faa2c049067994 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3411,8 +3411,26 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode, AArch64Subtarget::ARMProcFamilyEnum ProcFamily) {
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+          "Should only be called on loads or stores.");
   return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
+  unsigned Cost = 1;
+  switch(Opcode) {
+    case Instruction::Load:
+      Cost = SVEGatherOverhead;
+      break;
+    case Instruction::Store:
+      if (ProcFamily == AArch64Subtarget::NeoverseV2) {
+        Cost = 13;
+      } else {
+        Cost = SVEScatterOverhead;
+      }
+    break;
+    default:
+      llvm_unreachable("Shouldn't have reached here");
+  }
+  return Cost;
 }
 
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3446,7 +3464,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
   // Add on an overhead cost for using gathers/scatters.
   // TODO: At the moment this is applied unilaterally for all CPUs, but at some
   // point we may want a per-CPU overhead.
-  MemOpCost *= getSVEGatherScatterOverhead(Opcode);
+  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST->getProcFamily());
   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
new file mode 100644
index 00000000000000..8bdae00411a1f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -0,0 +1,8 @@
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output | FileCheck %s
+; CHECK: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
+
+define void @masked_scatter_nxv8f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
+  call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+  ret void
+}
+
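
For reference, the 52 checked above is the product of the three factors
computed in getGatherScatterOpCost. A minimal sketch of the arithmetic,
assuming a scalar-store MemOpCost of 1 and four lanes for
<vscale x 4 x float> at vscale 1 (illustrative standalone code, not part
of the patch):

  #include <cassert>

  int main() {
    unsigned LegalizationCost = 1; // LT.first: nxv4f32 is legal under SVE
    unsigned MemOpCost = 1;        // cost of a single scalar float store
    unsigned ScatterOverhead = 13; // Neoverse-V2 value from this patch
    unsigned MaxNumElements = 4;   // lanes of <vscale x 4 x float> at vscale 1
    unsigned Cost =
        LegalizationCost * (MemOpCost * ScatterOverhead) * MaxNumElements;
    assert(Cost == 52); // matches the CHECK line in scatter-cost.ll
    return 0;
  }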

>From 3c359cb588f0857c4dffc943681b6e4442f228fd Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Wed, 7 Aug 2024 19:38:16 +0530
Subject: [PATCH 2/5] Address review comments

---
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp    |  4 +++-
 llvm/lib/Target/AArch64/AArch64Subtarget.h      |  2 ++
 .../AArch64/AArch64TargetTransformInfo.cpp      | 17 ++++++-----------
 .../LoopVectorize/AArch64/scatter-cost.ll       |  2 +-
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 642006e706c13b..0f90f78dcde292 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -233,9 +233,11 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
     break;
+  case NeoverseV2:
+    ScatterOverhead = 13;
+  LLVM_FALLTHROUGH;
   case NeoverseN2:
   case NeoverseN3:
-  case NeoverseV2:
   case NeoverseV3:
     PrefFunctionAlignment = Align(16);
     PrefLoopAlignment = Align(32);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 0f3a637f98fbe7..3890758086f4ba 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -59,6 +59,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
+  unsigned ScatterOverhead = 10;
   uint16_t PrefetchDistance = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
@@ -225,6 +226,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const;
   unsigned getCacheLineSize() const override { return CacheLineSize; }
+  unsigned getScatterOverhead() const { return ScatterOverhead; }
   unsigned getPrefetchDistance() const override { return PrefetchDistance; }
   unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                 unsigned NumStridedMemAccesses,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index faa2c049067994..b8682a4d08446b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3411,26 +3411,21 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode, AArch64Subtarget::ARMProcFamilyEnum ProcFamily) {
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget* ST) {
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
           "Should be called on only load or stores.");
-  return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
-  unsigned Cost = 1;
   switch(Opcode) {
     case Instruction::Load:
-      Cost = SVEGatherOverhead;
+      return SVEGatherOverhead;
       break;
     case Instruction::Store:
-      if (ProcFamily == AArch64Subtarget::NeoverseV2) {
-        Cost = 13;
-      } else {
-        Cost = SVEScatterOverhead;
-      }
+      if (SVEScatterOverhead.getNumOccurrences() > 0)
+        return SVEScatterOverhead;
+      return ST->getScatterOverhead();
     break;
     default:
       llvm_unreachable("Shouldn't have reached here");
   }
-  return Cost;
 }
 
 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
@@ -3464,7 +3459,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
   // Add on an overhead cost for using gathers/scatters.
   // TODO: At the moment this is applied unilaterally for all CPUs, but at some
   // point we may want a per-CPU overhead.
-  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST->getProcFamily());
+  MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
index 8bdae00411a1f8..a17781105613c7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output | FileCheck %s
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
 
 define void @masked_scatter_nxv8f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
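
A note on the getNumOccurrences() check introduced above: this is the
usual LLVM pattern for letting a hidden command-line flag override a
subtarget default only when the user actually passed the flag. A
simplified, self-contained sketch of the shape (the real
SVEGatherOverhead/SVEScatterOverhead declarations live at the top of
AArch64TargetTransformInfo.cpp; the names below are illustrative):

  #include "llvm/Support/CommandLine.h"
  using namespace llvm;

  static cl::opt<unsigned>
      ScatterOverheadFlag("sve-scatter-overhead", cl::init(10), cl::Hidden,
                          cl::desc("Override the SVE scatter overhead"));

  static unsigned pickScatterOverhead(unsigned PerCPUDefault) {
    // An explicit -sve-scatter-overhead=N on the command line wins ...
    if (ScatterOverheadFlag.getNumOccurrences() > 0)
      return ScatterOverheadFlag;
    // ... otherwise use the subtarget's tuned value (13 on Neoverse-V2).
    return PerCPUDefault;
  }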

>From ea10a5f5c52f3ef52c4f55e7b7f4ee4bf75fee06 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Wed, 7 Aug 2024 21:01:01 +0530
Subject: [PATCH 3/5] Address more review comments

---
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp           | 3 ++-
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 0f90f78dcde292..32db1e8c2477a8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -234,8 +234,9 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     MaxBytesForLoopAlignment = 16;
     break;
   case NeoverseV2:
+    // Specialize cost for Neoverse-V2.
     ScatterOverhead = 13;
-  LLVM_FALLTHROUGH;
+    LLVM_FALLTHROUGH;
   case NeoverseN2:
   case NeoverseN3:
   case NeoverseV3:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8682a4d08446b..91e98a4a1011eb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3411,6 +3411,8 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
+// Returns the gather/scatter overhead, either from the command-line
+// override or from the per-CPU specialized value in \p ST.
 static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget* ST) {
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
           "Should be called on only load or stores.");
@@ -3457,8 +3459,6 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
                       {TTI::OK_AnyValue, TTI::OP_None}, I);
   // Add on an overhead cost for using gathers/scatters.
-  // TODO: At the moment this is applied unilaterally for all CPUs, but at some
-  // point we may want a per-CPU overhead.
   MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
   return LT.first * MemOpCost * getMaxNumElements(LegalVF);
 }
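
On the initializeProperties() change above: the NeoverseV2 case
deliberately falls through so that V2 overrides only ScatterOverhead and
otherwise shares the alignment tuning of its siblings. A self-contained
sketch of that shape (field and enum names abbreviated; not the actual
subtarget code):

  struct TuningProps {
    unsigned ScatterOverhead = 10; // generic default
    unsigned PrefLoopAlignment = 4;
  };

  enum ProcFamily { NeoverseN2, NeoverseN3, NeoverseV2, NeoverseV3 };

  static void initializeProps(ProcFamily Family, TuningProps &P) {
    switch (Family) {
    case NeoverseV2:
      P.ScatterOverhead = 13; // V2-specific tuning from this patch
      [[fallthrough]];        // then share the common N2/N3/V3 settings
    case NeoverseN2:
    case NeoverseN3:
    case NeoverseV3:
      P.PrefLoopAlignment = 32;
      break;
    }
  }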

>From fc1430d34a55f51ba8d6459e021518102457a741 Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Mon, 12 Aug 2024 20:45:43 +0530
Subject: [PATCH 4/5] Handle GatherOverhead too as requested

---
 llvm/lib/Target/AArch64/AArch64Subtarget.h             | 3 +++
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 3890758086f4ba..accfb49c6fbe3a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -59,7 +59,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 2;
   uint16_t CacheLineSize = 0;
+  // Default scatter/gather overhead.
   unsigned ScatterOverhead = 10;
+  unsigned GatherOverhead = 10;
   uint16_t PrefetchDistance = 0;
   uint16_t MinPrefetchStride = 1;
   unsigned MaxPrefetchIterationsAhead = UINT_MAX;
@@ -227,6 +229,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   unsigned getVectorInsertExtractBaseCost() const;
   unsigned getCacheLineSize() const override { return CacheLineSize; }
   unsigned getScatterOverhead() const { return ScatterOverhead; }
+  unsigned getGatherOverhead() const { return GatherOverhead; }
   unsigned getPrefetchDistance() const override { return PrefetchDistance; }
   unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                 unsigned NumStridedMemAccesses,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 91e98a4a1011eb..5aa8319b65cb9e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3418,7 +3418,9 @@ static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtar
           "Should be called on only load or stores.");
   switch(Opcode) {
     case Instruction::Load:
-      return SVEGatherOverhead;
+      if (SVEGatherOverhead.getNumOccurrences() > 0)
+        return SVEGatherOverhead;
+      return ST->getGatherOverhead();
       break;
     case Instruction::Store:
       if (SVEScatterOverhead.getNumOccurrences() > 0)
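
With both gather and scatter handled, the tuned defaults stay easy to
experiment with from the command line (assuming the existing hidden flag
names keep their current spelling), e.g.:

  opt -mtriple aarch64 -mcpu=neoverse-v2 -sve-gather-overhead=10 \
      -sve-scatter-overhead=10 -passes="print<cost-model>" -disable-output \
      < llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll 2>&1

which would restore the pre-patch costs without a rebuild.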

>From 0c6cbc47099f43575072886953a1ced8eb514a6f Mon Sep 17 00:00:00 2001
From: Madhur Amilkanthwar <madhura at nvidia.com>
Date: Tue, 13 Aug 2024 19:59:11 +0530
Subject: [PATCH 5/5] Add non-v2 case

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 27 ++++++++++---------
 .../LoopVectorize/AArch64/scatter-cost.ll     |  6 +++--
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5aa8319b65cb9e..4a80530e85094d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3413,22 +3413,23 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
 
 // Returns the gather/scatter overhead, either from the command-line
 // override or from the per-CPU specialized value in \p ST.
-static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget* ST) {
+static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
+                                            const AArch64Subtarget *ST) {
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
-          "Should only be called on loads or stores.");
+        "Should only be called on loads or stores.");
   switch(Opcode) {
-    case Instruction::Load:
-      if (SVEGatherOverhead.getNumOccurrences() > 0)
-        return SVEGatherOverhead;
-      return ST->getGatherOverhead();
-      break;
-    case Instruction::Store:
-      if (SVEScatterOverhead.getNumOccurrences() > 0)
-        return SVEScatterOverhead;
-      return ST->getScatterOverhead();
+  case Instruction::Load:
+    if (SVEGatherOverhead.getNumOccurrences() > 0)
+      return SVEGatherOverhead;
+    return ST->getGatherOverhead();
     break;
-    default:
-      llvm_unreachable("Shouldn't have reached here");
+  case Instruction::Store:
+    if (SVEScatterOverhead.getNumOccurrences() > 0)
+      return SVEScatterOverhead;
+    return ST->getScatterOverhead();
+    break;
+  default:
+    llvm_unreachable("Shouldn't have reached here");
   }
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
index a17781105613c7..c2701017ced7a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll
@@ -1,5 +1,7 @@
-; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s
-; CHECK: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
+; RUN: opt -mtriple aarch64 -mcpu=neoverse-v2 -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-V2
+; RUN: opt -mtriple aarch64  -passes="print<cost-model>" -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-GENERIC
+; CHECK-V2: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32
+; CHECK-GENERIC: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f32
 
 define void @masked_scatter_nxv8f32_i64(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, <vscale x 4 x i64> %V) #0 {
   call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %data, <vscale x 4 x ptr> %b, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
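
One note on the CHECK-GENERIC line: plain -mtriple aarch64 selects a
generic subtarget without SVE, so the masked scatter cannot be legalized
and the cost model reports it as Invalid. The overhead tuned in this
patch only applies once SVE gathers/scatters are legal; a run with
-mattr=+sve but no -mcpu (hypothetical, not part of the patch) would
instead produce a finite cost derived from the default overhead of 10:

  opt -mtriple aarch64 -mattr=+sve -passes="print<cost-model>" -disable-output \
      < llvm/test/Transforms/LoopVectorize/AArch64/scatter-cost.ll 2>&1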


