[llvm] AMDGPU/GFX12: Insert waitcnts before stores with scope_sys (PR #82996)

Mon Feb 26 06:24:39 PST 2024

https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/82996

>From b41acdc30b4ef122559a123f657bf31cbeeafd90 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Mon, 26 Feb 2024 13:41:18 +0100
Subject: [PATCH] AMDGPU/GFX12: Insert waitcnts before stores with scope_sys

Insert waitcnts for loads and atomics before stores with system scope.
Scope is field in instruction encoding and corresponds to desired
coherence level in cache hierarchy. Only intrinsic stores can set scope.
Currently there is no reliable way to set scope on generic stores they
are by default lowest scope level.
Waitcnts are not required if it is guaranteed that memory is cached.
For example vulkan shaders can guarantee this.
TODO: implement flag for frontends to give us a hint not to insert waits.
Expecting vulkan flag to be implemented as vulkan:private MMRA.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  2 +
 llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp  | 37 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  1 +
 .../AMDGPU/wait-for-stores-with-scope_sys.ll  | 16 ++++++++
 .../AMDGPU/wait-for-stores-with-scope_sys.mir | 22 +++++++++++
 5 files changed, 78 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d774826c1d08c0..a8a33a5fecb413 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       return AMDGPU::S_WAIT_BVHCNT;
     case AMDGPU::S_WAIT_DSCNT_soft:
       return AMDGPU::S_WAIT_DSCNT;
+    case AMDGPU::S_WAIT_KMCNT_soft:
+      return AMDGPU::S_WAIT_KMCNT;
     default:
       return Opcode;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808b33e42b..3cee2100886481 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ class SICacheControl {
                                               SIMemOp Op, bool IsVolatile,
                                               bool IsNonTemporal) const = 0;
 
+  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -603,6 +607,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                       bool IsVolatile,
                                       bool IsNonTemporal) const override;
+
+  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2381,6 +2387,33 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::expandSystemScopeStore(
+    MachineBasicBlock::iterator &MI) const {
+
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (!CPol || ((CPol->getImm() & CPol::SCOPE) != CPol::SCOPE_SYS))
+    return false;
+
+  // Stores with system scope (SCOPE_SYS) need to wait for:
+  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+  // - non-returning-atomics       - wait for STORECNT==0
+  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+  //   since it does not distinguish atomics-with-return from regular stores.
+
+  // There is no need to wait if memory is cached (mtype != UC).
+  // For example shader-visible memory is cached.
+  // TODO: implement flag for frontend to give us a hint not to insert waits.
+  MachineBasicBlock &MBB = *MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+  return true;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
@@ -2467,6 +2500,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   Changed |= CC->enableVolatileAndOrNonTemporal(
       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
       MOI.isNonTemporal());
+
+  // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+  // instruction field, do not confuse it with atomic scope.
+  Changed |= CC->expandSystemScopeStore(MI);
   return Changed;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0fe2845f8edc31..b5de311f8c58ce 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
   def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
   def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
   def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
 }
 
 def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
diff --git a/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll
new file mode 100644
index 00000000000000..074cc6b66e32dd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: intrinsic_store_system_scope:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
+  ret void
+}
+
+declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir
new file mode 100644
index 00000000000000..463a70a555a192
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-for-stores-with-scope_sys.mir
@@ -0,0 +1,22 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer  %s -o - | FileCheck -check-prefix=GFX12 %s
+
+---
+name: intrinsic_store_system_scope
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX12-LABEL: name: intrinsic_store_system_scope
+    ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+    ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+    ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+    ; GFX12-NEXT: S_ENDPGM 0
+    BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+    S_ENDPGM 0
+...