[llvm] 0eaf675 - [AMDGPU][InsertWaits] No wait for WAW for global/scratch_load
Ruiling Song via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 22 17:58:11 PST 2022
Author: Ruiling Song
Date: 2022-11-23T09:57:50+08:00
New Revision: 0eaf6759aee92fa27efb78a97405195869c67bb0
URL: https://github.com/llvm/llvm-project/commit/0eaf6759aee92fa27efb78a97405195869c67bb0
DIFF: https://github.com/llvm/llvm-project/commit/0eaf6759aee92fa27efb78a97405195869c67bb0.diff
LOG: [AMDGPU][InsertWaits] No wait for WAW for global/scratch_load
global/scratch_load instructions return in the order they are issued, so
there is no need to insert an s_waitcnt for a WAW hazard.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D138476
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 3ccd09449dc7a..3a7e017b0945e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -141,8 +141,13 @@ enum VmemType {
VMEM_BVH
};
+static bool updateVMCntOnly(const MachineInstr &Inst) {
+ return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
+ SIInstrInfo::isFLATScratch(Inst);
+}
+
VmemType getVmemType(const MachineInstr &Inst) {
- assert(SIInstrInfo::isVMEM(Inst));
+ assert(updateVMCntOnly(Inst));
if (!SIInstrInfo::isMIMG(Inst))
return VMEM_NOSAMPLER;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
@@ -683,7 +688,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (T == VM_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
- if (SIInstrInfo::isVMEM(Inst)) {
+ if (updateVMCntOnly(Inst)) {
VmemType V = getVmemType(Inst);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
VgprVmemTypes[RegNo] |= 1 << V;
@@ -1182,7 +1187,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// previous write and this write are the same type of VMEM
// instruction, in which case they're guaranteed to write their
// results in order anyway.
- if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
getVmemType(MI))) {
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 28865f55ec5f3..7828a632f9e46 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -476,9 +476,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
@@ -495,9 +493,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -515,9 +511,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index d5d24fad7a313..9ff7c3bd75385 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -534,7 +534,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
; GFX9-NEXT: .LBB3_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB3_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
@@ -706,7 +705,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB4_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
@@ -878,7 +876,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB5_3: ; %T
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 1af5080038f63..03cdd85a2960f 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -462,9 +462,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
@@ -481,9 +479,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
-; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -500,9 +496,7 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
-; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index e029286333a8b..1446c0c1b65ab 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1667,8 +1667,8 @@ define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x hal
; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 offset:16
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v3
@@ -1771,7 +1771,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
@@ -1786,7 +1785,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
@@ -1800,7 +1798,6 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
index d5a77787d897e..ac6cdbf7d563e 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -72,3 +72,56 @@ body: |
$vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
$vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s128))
...
+# (global_load + scratch_load + buffer_load)
+---
+name: global_scratch_buffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: global_scratch_buffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+...
+# waw between flat and buffer should have a wait inserted between.
+# (flat + buffer)
+---
+name: flat_buffer
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: flat_buffer
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX9-NEXT: S_WAITCNT 49279
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+...
+# buffer + flat
+---
+name: buffer_flat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-LABEL: name: buffer_flat
+ ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: S_WAITCNT 0
+ ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX9-NEXT: S_WAITCNT 3952
+ ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
More information about the llvm-commits
mailing list