[llvm] [AMDGPU] Create new option for force flush load counter (PR #124974)

Wed Jan 29 11:40:30 PST 2025

https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/124974

In certain situations it is beneficial to wait for all outstanding
loads, regardless of which specific load's data is needed. This may
reduce the number of cache requests.

Fixes: SWDEV-511507

>From d97055b66939661ed6ede9769a6142ae550829b0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 27 Jan 2025 13:27:31 -0800
Subject: [PATCH] [AMDGPU] Create new option for force flush load counter

In certain situations it is beneficial to wait for all outstanding
loads, regardless of which specific load's data is needed. This may
reduce the number of cache requests.

Fixes: SWDEV-511507
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  8 ++++
 llvm/test/CodeGen/AMDGPU/load-store-cnt.ll  | 48 +++++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/load-store-cnt.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index de2095fa60ffd4..3d6419778f4b1c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -53,6 +53,11 @@ static cl::opt<bool>
                                "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
                       cl::init(false), cl::Hidden);
 
+static cl::opt<bool> ForceEmitZeroLoadFlag(
+    "amdgpu-waitcnt-load-forcezero",
+    cl::desc("Force all waitcnt load counters to wait until 0"),
+    cl::init(false), cl::Hidden);
+
 namespace {
 // Class of object that encapsulates latest instruction counter score
 // associated with the operand.  Used for determining whether
@@ -1850,6 +1855,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       Wait.BvhCnt = 0;
   }
 
+  if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
+    Wait.LoadCnt = 0;
+
   return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                          OldWaitcntInstr);
 }
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-cnt.ll b/llvm/test/CodeGen/AMDGPU/load-store-cnt.ll
new file mode 100644
index 00000000000000..a7fccde4166713
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-store-cnt.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-waitcnt-load-forcezero < %s | FileCheck --check-prefixes=LDZERO %s
+
+define amdgpu_kernel void @copy(ptr addrspace(1) noalias nocapture readonly %src1, ptr addrspace(1) noalias nocapture readonly %src2, ptr addrspace(1) noalias nocapture writeonly %dst1, ptr addrspace(1) noalias nocapture writeonly %dst2) {
+; DEFAULT-LABEL: copy:
+; DEFAULT:       ; %bb.0:
+; DEFAULT-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; DEFAULT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DEFAULT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; DEFAULT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; DEFAULT-NEXT:    s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT:    s_clause 0x1
+; DEFAULT-NEXT:    global_load_b32 v1, v0, s[0:1]
+; DEFAULT-NEXT:    global_load_b32 v2, v0, s[2:3]
+; DEFAULT-NEXT:    s_waitcnt vmcnt(1)
+; DEFAULT-NEXT:    global_store_b32 v0, v1, s[4:5]
+; DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; DEFAULT-NEXT:    global_store_b32 v0, v2, s[6:7]
+; DEFAULT-NEXT:    s_endpgm
+;
+; LDZERO-LABEL: copy:
+; LDZERO:       ; %bb.0:
+; LDZERO-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; LDZERO-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; LDZERO-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; LDZERO-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; LDZERO-NEXT:    s_waitcnt lgkmcnt(0)
+; LDZERO-NEXT:    s_clause 0x1
+; LDZERO-NEXT:    global_load_b32 v1, v0, s[0:1]
+; LDZERO-NEXT:    global_load_b32 v2, v0, s[2:3]
+; LDZERO-NEXT:    s_waitcnt vmcnt(0)
+; LDZERO-NEXT:    s_clause 0x1
+; LDZERO-NEXT:    global_store_b32 v0, v1, s[4:5]
+; LDZERO-NEXT:    global_store_b32 v0, v2, s[6:7]
+; LDZERO-NEXT:    s_endpgm
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %idx = zext i32 %id to i64
+  %gep.ld1 = getelementptr inbounds nuw float, ptr addrspace(1) %src1, i64 %idx
+  %v1 = load float, ptr addrspace(1) %gep.ld1, align 4
+  %gep.ld2 = getelementptr inbounds nuw float, ptr addrspace(1) %src2, i64 %idx
+  %v2 = load float, ptr addrspace(1) %gep.ld2, align 4
+  %gep.st1 = getelementptr inbounds nuw float, ptr addrspace(1) %dst1, i64 %idx
+  store float %v1, ptr addrspace(1) %gep.st1, align 4
+  %gep.st2 = getelementptr inbounds nuw float, ptr addrspace(1) %dst2, i64 %idx
+  store float %v2, ptr addrspace(1) %gep.st2, align 4
+  ret void
+}