[llvm-branch-commits] [llvm] [AMDGPU] GFX12 VMEM instructions can write VGPR results out of order (PR #105549)
Jay Foad via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Aug 21 09:40:11 PDT 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/105549
Fix SIInsertWaitcnts to account for this by adding extra waits to avoid
WAW dependencies.
>From 9a2103df4094af38f59e1adce5414b94672e6d6e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 21 Aug 2024 16:23:49 +0100
Subject: [PATCH] [AMDGPU] GFX12 VMEM instructions can write VGPR results out
of order
Fix SIInsertWaitcnts to account for this by adding extra waits to avoid
WAW dependencies.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 23 ++++++++++++++-----
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +++
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 +++---
.../buffer-fat-pointer-atomicrmw-fadd.ll | 3 +++
.../buffer-fat-pointer-atomicrmw-fmax.ll | 5 ++++
.../buffer-fat-pointer-atomicrmw-fmin.ll | 5 ++++
....amdgcn.struct.buffer.load.format.v3f16.ll | 1 +
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 10 +++++++-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 10 ++++++++
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 2 ++
.../AMDGPU/spill-csr-frame-ptr-reg-copy.ll | 1 +
.../CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir | 8 +++----
12 files changed, 64 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7906e0ee9d7858..9efdbd751d96e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -953,6 +953,12 @@ def FeatureRequiredExportPriority : SubtargetFeature<"required-export-priority",
"Export priority must be explicitly manipulated on GFX11.5"
>;
+def FeatureVmemWriteVgprInOrder : SubtargetFeature<"vmem-write-vgpr-in-order",
+ "HasVmemWriteVgprInOrder",
+ "true",
+ "VMEM instructions of the same type write VGPR results in order"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1123,7 +1129,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1136,7 +1143,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1152,7 +1160,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
- FeatureDefaultComponentZero
+ FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
]
>;
@@ -1170,7 +1178,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
+ FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1193,7 +1202,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength63,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
- FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts
+ FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
@@ -1215,7 +1225,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength32,
- FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts
+ FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
+ FeatureVmemWriteVgprInOrder
]
>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 902f51ae358d59..9386bcf0d74b22 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -239,6 +239,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasVALUTransUseHazard = false;
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
+ bool HasVmemWriteVgprInOrder = false;
bool RequiresCOV6 = false;
@@ -1285,6 +1286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
+ bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
+
/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 59a1eee8d4f91d..4262e7b5d9c250 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1778,11 +1778,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (IsVGPR) {
// RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
// previous write and this write are the same type of VMEM
- // instruction, in which case they're guaranteed to write their
- // results in order anyway.
+ // instruction, in which case they are (in some architectures)
+ // guaranteed to write their results in order anyway.
if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
- getVmemType(MI))) {
+ getVmemType(MI)) ||
+ !ST->hasVmemWriteVgprInOrder()) {
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 9d9e6898417e87..63cdd8a3bb16dc 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -2599,6 +2599,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -4432,6 +4433,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
@@ -5911,6 +5913,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 172ce4c065e13d..c90296124eb127 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -1737,6 +1737,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3459,6 +3460,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
@@ -4959,6 +4961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
@@ -6329,6 +6332,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -7822,6 +7826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 61ee956747135f..91adbfa5599761 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -1737,6 +1737,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -3459,6 +3460,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
@@ -4959,6 +4961,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
@@ -6329,6 +6332,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
@@ -7822,6 +7826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index 4c1ae4c228adb3..0522d5258b9b5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -128,6 +128,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 355c296d122ff2..22b718935738bd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -745,7 +745,7 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_clause 0xf
+; GFX12-NEXT: s_clause 0x7
; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
@@ -754,13 +754,21 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
+; GFX12-NEXT: s_wait_loadcnt 0x7
; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 142bc37fdeb755..4cc47b09d813d6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3563,15 +3563,19 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -4371,8 +4375,10 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
@@ -7341,8 +7347,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v39
@@ -7364,8 +7372,10 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index c0649322c81953..7cdf270810dea0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3091,8 +3091,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
index b045dd559aac26..34bcc3f02ac66d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -15,6 +15,7 @@
; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
; GCN: s_xor_saveexec_b64
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
index e51174919b8d3a..bdef55ab956a01 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -297,7 +297,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
-# GFX12-NOT: S_WAIT_LOADCNT 0
+# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name: waitcnt_vm_loop2
body: |
@@ -344,7 +344,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
-# GFX12-NOT: S_WAIT_LOADCNT 0
+# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name: waitcnt_vm_loop2_store
body: |
@@ -445,7 +445,7 @@ body: |
# GFX12-LABEL: bb.1:
# GFX12-NOT: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
-# GFX12-NOT: S_WAIT_LOADCNT 0
+# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.3:
name: waitcnt_vm_loop2_nowait
body: |
@@ -602,7 +602,7 @@ body: |
# GFX12: BUFFER_LOAD_FORMAT_X_IDXEN
# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.1:
-# GFX12-NOT: S_WAIT_LOADCNT 0
+# GFX12: S_WAIT_LOADCNT 0
# GFX12-LABEL: bb.2:
name: waitcnt_vm_zero
More information about the llvm-branch-commits
mailing list