[llvm-branch-commits] [llvm] [AMDGPU] Handle direct loads to LDS in memory model (PR #142018)
Austin Kerbow via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu May 29 12:28:49 PDT 2025
https://github.com/kerbowa created https://github.com/llvm/llvm-project/pull/142018
Insert additional waitcnt instructions to ensure proper ordering between LDS
operations and direct loads from global memory to LDS on pre-GFX10
hardware.
Direct LDS loads perform both a global memory load and an LDS store,
which can be reordered with respect to other LDS operations without
explicit synchronization. This can cause ordering violations even within
a single thread.
The change conservatively inserts vmcnt(0) waits for all sync scopes
when the LDS address space is involved. Future optimizations in
SIInsertWaitcnts can relax this to only wait for outstanding direct LDS
loads rather than all vmcnt events.
This change only affects LDS address space synchronization and preserves
existing cross-address space ordering behavior.
From c5c5225accd5dbc32cc62e64ae63bb00f5632a1c Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Thu, 29 May 2025 10:28:16 -0700
Subject: [PATCH] [AMDGPU] Handle direct loads to LDS in memory model
Insert additional waitcnt instructions to ensure proper ordering between LDS
operations and direct loads from global memory to LDS on pre-GFX10
hardware.
Direct LDS loads perform both a global memory load and an LDS store,
which can be reordered with respect to other LDS operations without
explicit synchronization. This can cause ordering violations even within
a single thread.
The change conservatively inserts vmcnt(0) waits for all sync scopes
when the LDS address space is involved. Future optimizations in
SIInsertWaitcnts can relax this to only wait for outstanding direct LDS
loads rather than all vmcnt events.
This change only affects LDS address space synchronization and preserves
existing cross-address space ordering behavior.
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 17 ++++
.../memory-legalizer-atomic-fence.ll | 80 +++++++++++++++++++
.../CodeGen/AMDGPU/branch-condition-and.ll | 4 +-
.../CodeGen/AMDGPU/indirect-addressing-si.ll | 2 +
.../kernel-vgpr-spill-mubuf-with-voffset.ll | 1 +
5 files changed, 103 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 56fec409d11ae..7624bcfe3da0e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1084,6 +1084,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
bool VMCnt = false;
bool LGKMCnt = false;
+ bool DirectLDSWait = false;
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
SIAtomicAddrSpace::NONE) {
@@ -1104,6 +1105,10 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
}
if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ // Wait for direct loads to LDS from global memory to ensure that
+ // LDS operations cannot be reordered with respect to global memory
+ // operations.
+ DirectLDSWait = true;
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
@@ -1149,6 +1154,18 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
}
}
+ // Conservatively wait for vmcnt(0) to ensure that LDS operations and direct
+ // LDS loads from global memory cannot be reordered with respect to each other.
+ // This waitcnt can be safely optimized to wait for a higher vmcnt based on
+ // the number of outstanding direct LDS loads.
+ if (DirectLDSWait) {
+ unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(
+ IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV));
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_DIRECT_LDS_LOAD_soft))
+ .addImm(WaitCntImmediate);
+ Changed = true;
+ }
+
if (VMCnt || LGKMCnt) {
unsigned WaitCntImmediate =
AMDGPU::encodeWaitcnt(IV,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index 66037615f0ba0..7f197b3580042 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -13,12 +13,14 @@
define amdgpu_kernel void @system_one_as_acquire() #0 {
; GFX6-LABEL: name: system_one_as_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_one_as_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -62,11 +64,13 @@ entry:
define amdgpu_kernel void @system_one_as_release() #0 {
; GFX6-LABEL: name: system_one_as_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_one_as_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
@@ -101,12 +105,14 @@ entry:
define amdgpu_kernel void @system_one_as_acq_rel() #0 {
; GFX6-LABEL: name: system_one_as_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_one_as_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -150,12 +156,14 @@ entry:
define amdgpu_kernel void @system_one_as_seq_cst() #0 {
; GFX6-LABEL: name: system_one_as_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_one_as_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -199,10 +207,12 @@ entry:
define amdgpu_kernel void @singlethread_one_as_acquire() #0 {
; GFX6-LABEL: name: singlethread_one_as_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_one_as_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_one_as_acquire
@@ -228,10 +238,12 @@ entry:
define amdgpu_kernel void @singlethread_one_as_release() #0 {
; GFX6-LABEL: name: singlethread_one_as_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_one_as_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_one_as_release
@@ -257,10 +269,12 @@ entry:
define amdgpu_kernel void @singlethread_one_as_acq_rel() #0 {
; GFX6-LABEL: name: singlethread_one_as_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_one_as_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_one_as_acq_rel
@@ -286,10 +300,12 @@ entry:
define amdgpu_kernel void @singlethread_one_as_seq_cst() #0 {
; GFX6-LABEL: name: singlethread_one_as_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_one_as_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_one_as_seq_cst
@@ -315,12 +331,14 @@ entry:
define amdgpu_kernel void @agent_one_as_acquire() #0 {
; GFX6-LABEL: name: agent_one_as_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_one_as_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -364,11 +382,13 @@ entry:
define amdgpu_kernel void @agent_one_as_release() #0 {
; GFX6-LABEL: name: agent_one_as_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_one_as_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
@@ -403,12 +423,14 @@ entry:
define amdgpu_kernel void @agent_one_as_acq_rel() #0 {
; GFX6-LABEL: name: agent_one_as_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_one_as_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -452,12 +474,14 @@ entry:
define amdgpu_kernel void @agent_one_as_seq_cst() #0 {
; GFX6-LABEL: name: agent_one_as_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 3952
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_one_as_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 3952
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -501,10 +525,12 @@ entry:
define amdgpu_kernel void @workgroup_one_as_acquire() #0 {
; GFX6-LABEL: name: workgroup_one_as_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_one_as_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: workgroup_one_as_acquire
@@ -536,10 +562,12 @@ entry:
define amdgpu_kernel void @workgroup_one_as_release() #0 {
; GFX6-LABEL: name: workgroup_one_as_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_one_as_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: workgroup_one_as_release
@@ -569,10 +597,12 @@ entry:
define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 {
; GFX6-LABEL: name: workgroup_one_as_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_one_as_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: workgroup_one_as_acq_rel
@@ -604,10 +634,12 @@ entry:
define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 {
; GFX6-LABEL: name: workgroup_one_as_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_one_as_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: workgroup_one_as_seq_cst
@@ -639,10 +671,12 @@ entry:
define amdgpu_kernel void @wavefront_one_as_acquire() #0 {
; GFX6-LABEL: name: wavefront_one_as_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_one_as_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_one_as_acquire
@@ -668,10 +702,12 @@ entry:
define amdgpu_kernel void @wavefront_one_as_release() #0 {
; GFX6-LABEL: name: wavefront_one_as_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_one_as_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_one_as_release
@@ -697,10 +733,12 @@ entry:
define amdgpu_kernel void @wavefront_one_as_acq_rel() #0 {
; GFX6-LABEL: name: wavefront_one_as_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_one_as_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_one_as_acq_rel
@@ -726,10 +764,12 @@ entry:
define amdgpu_kernel void @wavefront_one_as_seq_cst() #0 {
; GFX6-LABEL: name: wavefront_one_as_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_one_as_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_one_as_seq_cst
@@ -755,12 +795,14 @@ entry:
define amdgpu_kernel void @system_acquire() #0 {
; GFX6-LABEL: name: system_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -804,11 +846,13 @@ entry:
define amdgpu_kernel void @system_release() #0 {
; GFX6-LABEL: name: system_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: S_ENDPGM 0
;
@@ -843,12 +887,14 @@ entry:
define amdgpu_kernel void @system_acq_rel() #0 {
; GFX6-LABEL: name: system_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -892,12 +938,14 @@ entry:
define amdgpu_kernel void @system_seq_cst() #0 {
; GFX6-LABEL: name: system_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: system_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -941,10 +989,12 @@ entry:
define amdgpu_kernel void @singlethread_acquire() #0 {
; GFX6-LABEL: name: singlethread_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_acquire
@@ -970,10 +1020,12 @@ entry:
define amdgpu_kernel void @singlethread_release() #0 {
; GFX6-LABEL: name: singlethread_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_release
@@ -999,10 +1051,12 @@ entry:
define amdgpu_kernel void @singlethread_acq_rel() #0 {
; GFX6-LABEL: name: singlethread_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_acq_rel
@@ -1028,10 +1082,12 @@ entry:
define amdgpu_kernel void @singlethread_seq_cst() #0 {
; GFX6-LABEL: name: singlethread_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: singlethread_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: singlethread_seq_cst
@@ -1057,12 +1113,14 @@ entry:
define amdgpu_kernel void @agent_acquire() #0 {
; GFX6-LABEL: name: agent_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -1106,11 +1164,13 @@ entry:
define amdgpu_kernel void @agent_release() #0 {
; GFX6-LABEL: name: agent_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: S_ENDPGM 0
;
@@ -1145,12 +1205,14 @@ entry:
define amdgpu_kernel void @agent_acq_rel() #0 {
; GFX6-LABEL: name: agent_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -1194,12 +1256,14 @@ entry:
define amdgpu_kernel void @agent_seq_cst() #0 {
; GFX6-LABEL: name: agent_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 112
; GFX6-NEXT: BUFFER_WBINVL1 implicit $exec
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: agent_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 112
; GFX8-NEXT: BUFFER_WBINVL1_VOL implicit $exec
; GFX8-NEXT: S_ENDPGM 0
@@ -1243,11 +1307,13 @@ entry:
define amdgpu_kernel void @workgroup_acquire() #0 {
; GFX6-LABEL: name: workgroup_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 127
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 127
; GFX8-NEXT: S_ENDPGM 0
;
@@ -1282,11 +1348,13 @@ entry:
define amdgpu_kernel void @workgroup_release() #0 {
; GFX6-LABEL: name: workgroup_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 127
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 127
; GFX8-NEXT: S_ENDPGM 0
;
@@ -1319,11 +1387,13 @@ entry:
define amdgpu_kernel void @workgroup_acq_rel() #0 {
; GFX6-LABEL: name: workgroup_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 127
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 127
; GFX8-NEXT: S_ENDPGM 0
;
@@ -1358,11 +1428,13 @@ entry:
define amdgpu_kernel void @workgroup_seq_cst() #0 {
; GFX6-LABEL: name: workgroup_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_WAITCNT_soft 127
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: workgroup_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_WAITCNT_soft 127
; GFX8-NEXT: S_ENDPGM 0
;
@@ -1397,10 +1469,12 @@ entry:
define amdgpu_kernel void @wavefront_acquire() #0 {
; GFX6-LABEL: name: wavefront_acquire
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_acquire
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_acquire
@@ -1426,10 +1500,12 @@ entry:
define amdgpu_kernel void @wavefront_release() #0 {
; GFX6-LABEL: name: wavefront_release
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_release
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_release
@@ -1455,10 +1531,12 @@ entry:
define amdgpu_kernel void @wavefront_acq_rel() #0 {
; GFX6-LABEL: name: wavefront_acq_rel
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_acq_rel
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_acq_rel
@@ -1484,10 +1562,12 @@ entry:
define amdgpu_kernel void @wavefront_seq_cst() #0 {
; GFX6-LABEL: name: wavefront_seq_cst
; GFX6: bb.0.entry:
+ ; GFX6-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX6-NEXT: S_ENDPGM 0
;
; GFX8-LABEL: name: wavefront_seq_cst
; GFX8: bb.0.entry:
+ ; GFX8-NEXT: S_WAITCNT_DIRECT_LDS_LOAD_soft 3952
; GFX8-NEXT: S_ENDPGM 0
;
; GFX10WGP-LABEL: name: wavefront_seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
index 2bf4a2c028fdc..9fd44da40453f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -17,12 +17,14 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index b5665835eaf7a..8c2011b49ceb9 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -7807,10 +7807,12 @@ define amdgpu_kernel void @multi_same_block(i32 %arg) {
; NOOPT-NEXT: ; implicit-def: $sgpr0
; NOOPT-NEXT: v_mov_b32_e32 v0, s0
; NOOPT-NEXT: ds_write_b32 v0, v2
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: s_mov_b32 m0, -1
; NOOPT-NEXT: ; implicit-def: $sgpr0
; NOOPT-NEXT: v_mov_b32_e32 v0, s0
; NOOPT-NEXT: ds_write_b32 v0, v1
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
; NOOPT-NEXT: s_endpgm
;
; SI-MOVREL-LABEL: multi_same_block:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 0681263b7428e..04e352984b948 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -71,6 +71,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .LBB0_2: ; %end
; CHECK-NEXT: s_endpgm
More information about the llvm-branch-commits
mailing list