[llvm] [AMDGPU] Include WWM register spill into BB Prolog (PR #111496)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 8 01:10:50 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Christudasan Devadasan (cdevadas)
<details>
<summary>Changes</summary>
With #<!-- -->93526 we split the regalloc pipeline further
to have a standalone allocation for wwm registers
and per-lane VGPRs. Currently the presence of the
wwm-spill reloads inserted at the bb-top limits the
isBasicPrologue function during the per-lane vgpr
regalloc to skip past the exec manipulation instruction
and ended up causing incorrect codegen. The wmm-spill
inserted during the wwm-regalloc pipeline should also
be included in the bb-prolog so that the per-lane vgpr
regalloc pipeline can identify the appropriate insertion
points for their spills and copies.
---
Patch is 188.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111496.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+56-51)
- (modified) llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll (+4-10)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+153-125)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+135-118)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+67-64)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+72-68)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+72-62)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+6-6)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0d153df5c3977c..0c2ae382f53a19 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8903,7 +8903,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
uint16_t Opcode = MI.getOpcode();
return IsNullOrVectorRegister &&
- (isSGPRSpill(Opcode) ||
+ (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index e9e7360733581a..c9426106af5dad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -68,6 +68,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_or_saveexec_b32 s21, -1
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -84,10 +87,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; CHECK-NEXT: s_or_saveexec_b32 s21, -1
-; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: s_mov_b32 exec_lo, s21
-; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -104,7 +104,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_mov_b32 s17, s6
; CHECK-NEXT: s_mov_b32 s18, s5
; CHECK-NEXT: s_mov_b32 s19, s4
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
@@ -138,8 +137,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s21
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -157,6 +154,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
index 2b98f61748066f..9988b2fa1eaf08 100644
--- a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll
@@ -29,11 +29,11 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
; REGALLOC-NEXT: bb.1.Flow:
; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; REGALLOC-NEXT: {{ $}}
- ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0, implicit-def $sgpr4_sgpr5
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr4, 2, $vgpr63, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
@@ -62,11 +62,11 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
; REGALLOC-NEXT: S_BRANCH %bb.1
; REGALLOC-NEXT: {{ $}}
; REGALLOC-NEXT: bb.4.bb.3:
- ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
; REGALLOC-NEXT: $vgpr63 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; REGALLOC-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2, implicit-def $sgpr4_sgpr5
; REGALLOC-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR killed $vgpr63, 3
; REGALLOC-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
+ ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
; REGALLOC-NEXT: renamable $vgpr0 = V_LSHL_ADD_U32_e64 killed $vgpr0, 2, $vgpr0, implicit $exec
; REGALLOC-NEXT: SI_RETURN implicit killed $vgpr0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 7cec15ea5be87a..75d0b83a024ff5 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -67,7 +67,6 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_4
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -75,12 +74,14 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
@@ -99,8 +100,6 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB0_3
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -108,7 +107,9 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -235,7 +236,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_3
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -243,12 +243,14 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s4, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s5, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GCN-O0-NEXT: s_mov_b32 s1, s2
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
@@ -267,8 +269,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB1_4
; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -276,7 +276,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -302,8 +304,6 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_branch .LBB1_5
; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end
-; GCN-O0-NEXT: s_waitcnt expcnt(1)
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -314,7 +314,9 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -454,18 +456,17 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_6
; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 2
-; GCN-O0-NEXT: s_waitcnt vmcnt(1)
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
-; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v4, s2, 4
; GCN-O0-NEXT: v_writelane_b32 v4, s3, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -492,7 +493,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB2_5
; GCN-O0-NEXT: ; %bb.3: ; %bb.then
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -500,7 +500,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 1
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -516,7 +518,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-O0-NEXT: s_branch .LBB2_5
; GCN-O0-NEXT: .LBB2_4: ; %bb.else
-; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -524,7 +525,9 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v4, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v4, 1
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: v_mov_b32_e32 v0, 2
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1
; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
@@ -721,13 +724,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB3_8
; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s0, 0xf000
; GCN-O0-NEXT: s_mov_b32 s2, 0
; GCN-O0-NEXT: s_mov_b32 s4, s2
@@ -737,12 +740,11 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
; GCN-O0-NEXT: v_mov_b32_e32 v1, 1
-; GCN-O0-NEXT: s_waitcnt vmcnt(1)
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 offset:4
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 4
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 5
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -768,13 +770,13 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GCN-O0-NEXT: s_branch .LBB3_7
; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v6, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
; GCN-O0-NEXT: s_mov_b32 s0, 0
; GCN-O0-NEXT: s_mov_b32 s2, s0
@@ -784,11 +786,10 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-O0-NEXT: v_mov_b32_e32 v1, 3
-; GCN-O0-NEXT: s_waitcnt vmcnt(1)
+; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64 offset:12
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0
; GCN-O0-NEXT: s_mov_b64 s[0:1], exec
-; GCN-O0-NEXT: s_waitcnt vmcnt(1)
; GCN-O0-NEXT: v_writelane_b32 v6, s0, 6
; GCN-O0-NEXT: v_writelane_b32 v6, s1, 7
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
@@ -926,7 +927,6 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_mov_b64 exec, s[0:1]
; GCN-O0-NEXT: s_cbranch_execz .LBB4_2
; GCN-O0-NEXT: ; %bb.1: ; %bb.then
-; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 ; 4-byte Folded Reload
@@ -934,12 +934,14 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
; GCN-O0-NEXT: v_readlane_b32 s0, v3, 0
; GCN-O0-NEXT: v_readlane_b32 s1, v3, 1
+; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
; GCN-O0-NEXT: s_mov_b32 s4, 0
; GCN-O0-NE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/111496
More information about the llvm-commits
mailing list