[llvm] [AMDGPU] Do not bundle meta instructions with memops (PR #153533)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 13 21:36:56 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Yoonseo Choi (yoonseoch)
<details>
<summary>Changes</summary>
Bundling memory ops with in-between SCHED_BARRIER can prevent that SCHED_BARRIER or any neighboring SCHED_BARRIER being honored. As users already provided SCHED_BARRIER between memory ops, don't bundle it together with memory ops.
---
Patch is 106.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153533.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIPostRABundler.cpp (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+470-460)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+23-23)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+27-25)
- (modified) llvm/test/CodeGen/AMDGPU/max.i16.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir (+91-34)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68be..e12d9e6c69509 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -179,20 +179,20 @@ bool SIPostRABundler::run(MachineFunction &MF) {
Next = std::next(I);
assert(BundleEnd != I);
- if (canBundle(*BundleEnd, *I)) {
- BundleEnd = I;
- if (I->getNumExplicitDefs() != 0)
- Defs.insert(I->defs().begin()->getReg());
- ++ClauseLength;
- } else if (!I->isMetaInstruction()) {
- // Allow meta instructions in between bundle candidates, but do not
- // start or end a bundle on one.
- //
- // TODO: It may be better to move meta instructions like dbg_value
- // after the bundle. We're relying on the memory legalizer to unbundle
- // these.
+ if (!canBundle(*BundleEnd, *I)) {
+ // Do not allow even meta instructions (e.g. SCHED_BARRIER) bundled
+ // between memory operations.
+ // SCHED_BARRIERs are added by users for a finer control over schedule
+ // than bundling.
+ // Examples of meta instructions: WAVE_BARRIER, SCHED_{GROUP_}BARRIER,
+ // IGLP_OPT, amdgcn.unreachable.
break;
}
+
+ BundleEnd = I;
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+ ++ClauseLength;
}
Next = std::next(BundleEnd);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538215f18..2022bd07fd281 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -4333,9 +4333,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4437,18 +4442,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -4478,45 +4510,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB12_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -4694,6 +4695,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: s_cbranch_execz .LBB12_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30
@@ -5302,9 +5304,16 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -5398,14 +5407,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -5437,15 +5444,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5493,11 +5500,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB12_2
@@ -40180,9 +40182,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -40284,18 +40291,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -40325,45 +40359,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB36_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -40541,6 +40544,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: s_cbranch_execz .LBB36_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f32_e32 v32, 1.0, v32
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_f32_e32 v31, 1.0, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_add_f32_e32 v30, 1.0, v30
@@ -41149,9 +41153,16 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -41245,14 +41256,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -41284,15 +41293,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -41340,11 +41349,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB36_2
@@ -75962,9 +75966,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/153533
More information about the llvm-commits
mailing list