[llvm] [AMDGPU] Add cl option to relax lds dma waitcnt (PR #131842)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 20 14:16:27 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Austin Kerbow (kerbowa)
<details>
<summary>Changes</summary>
---
Full diff: https://github.com/llvm/llvm-project/pull/131842.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+6-1)
- (added) llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll (+146)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 239f2664f59f3..51cfc3f005c19 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -58,6 +58,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
cl::desc("Force all waitcnt load counters to wait until 0"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> RelaxLDSDMA(
+ "amdgpu-relax-lds-dma-waitcnt",
+ cl::desc("Relax the waitcnt for LDS DMA instructions that do not alias"),
+ cl::init(false), cl::ReallyHidden);
+
namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
}
}
}
- if (!FoundAliasingStore)
+ if (!FoundAliasingStore && !RelaxLDSDMA)
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
if (Memop->isStore()) {
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
diff --git a/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
new file mode 100644
index 0000000000000..f3fe4946b26c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/relax-lds-dma-waitcnt.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-relax-lds-dma-waitcnt < %s | FileCheck %s --check-prefix=RELAXED
+
+; In relaxed mode, don't wait on vmcnt(0) if the global_load_lds and ds_reads do not alias
+
+define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_no_alias_ds_read:
+; DEFAULT: ; %bb.0: ; %main_body
+; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT: s_mov_b32 m0, 0
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT: s_movk_i32 m0, 0x100
+; DEFAULT-NEXT: s_nop 0
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT: s_lshl_b32 s0, s3, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v1, s0
+; DEFAULT-NEXT: s_waitcnt vmcnt(1)
+; DEFAULT-NEXT: s_barrier
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: ds_read_b32 v0, v0 offset:512
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: s_barrier
+; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:768
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT: s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_no_alias_ds_read:
+; RELAXED: ; %bb.0: ; %main_body
+; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT: v_mov_b32_e32 v2, 0
+; RELAXED-NEXT: s_mov_b32 m0, 0
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT: s_movk_i32 m0, 0x100
+; RELAXED-NEXT: s_nop 0
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT: v_mov_b32_e32 v0, s0
+; RELAXED-NEXT: s_lshl_b32 s0, s3, 2
+; RELAXED-NEXT: v_mov_b32_e32 v1, s0
+; RELAXED-NEXT: s_waitcnt vmcnt(1)
+; RELAXED-NEXT: s_barrier
+; RELAXED-NEXT: ds_read_b32 v0, v0 offset:512
+; RELAXED-NEXT: s_waitcnt vmcnt(0)
+; RELAXED-NEXT: s_barrier
+; RELAXED-NEXT: ds_read_b32 v1, v1 offset:768
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3953)
+ call void @llvm.amdgcn.s.barrier()
+ %gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
+ %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+ call void @llvm.amdgcn.s.waitcnt(i32 3952)
+ call void @llvm.amdgcn.s.barrier()
+ %gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+ %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+ %tmp = insertelement <2 x float> poison, float %val.0, i32 0
+ %res = insertelement <2 x float> %tmp, float %val.1, i32 1
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+; Always wait on vmcnt(0) if the global_load_lds and ds_reads alias
+
+define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+; DEFAULT-LABEL: global_load_lds_dword_2_arrays:
+; DEFAULT: ; %bb.0: ; %main_body
+; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
+; DEFAULT-NEXT: s_mov_b32 m0, 0
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; DEFAULT-NEXT: s_movk_i32 m0, 0x100
+; DEFAULT-NEXT: s_nop 0
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:8
+; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:12
+; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
+; DEFAULT-NEXT: s_lshl_b32 s1, s3, 2
+; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
+; DEFAULT-NEXT: v_mov_b32_e32 v1, s1
+; DEFAULT-NEXT: s_waitcnt vmcnt(0)
+; DEFAULT-NEXT: ds_read_b32 v0, v0
+; DEFAULT-NEXT: ; wave barrier
+; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:256
+; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
+; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; DEFAULT-NEXT: s_endpgm
+;
+; RELAXED-LABEL: global_load_lds_dword_2_arrays:
+; RELAXED: ; %bb.0: ; %main_body
+; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; RELAXED-NEXT: v_mov_b32_e32 v2, 0
+; RELAXED-NEXT: s_mov_b32 m0, 0
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
+; RELAXED-NEXT: s_movk_i32 m0, 0x100
+; RELAXED-NEXT: s_nop 0
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:8
+; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:12
+; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
+; RELAXED-NEXT: s_lshl_b32 s1, s3, 2
+; RELAXED-NEXT: v_mov_b32_e32 v0, s0
+; RELAXED-NEXT: v_mov_b32_e32 v1, s1
+; RELAXED-NEXT: s_waitcnt vmcnt(0)
+; RELAXED-NEXT: ds_read_b32 v0, v0
+; RELAXED-NEXT: ; wave barrier
+; RELAXED-NEXT: ds_read_b32 v1, v1 offset:256
+; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
+; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; RELAXED-NEXT: s_endpgm
+main_body:
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
+ call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
+ %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+ %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+ %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+ call void @llvm.amdgcn.wave.barrier()
+ %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+ %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
+ %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
+@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
``````````
</details>
https://github.com/llvm/llvm-project/pull/131842
More information about the llvm-commits
mailing list