[llvm] [AMDGPU] Do not bundle meta instructions with memops (PR #153533)
Yoonseo Choi via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 15 20:40:29 PDT 2025
https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/153533
From 5532bd03b52ac57d2ad2861993aae6e3924eb4dc Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:40:04 -0500
Subject: [PATCH 1/5] [AMDGPU] Do not bundle a meta-instruction with memory
operations
---
llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 19 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 930 +++++++++---------
.../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 46 +-
.../AMDGPU/gfx-callable-argument-types.ll | 52 +-
llvm/test/CodeGen/AMDGPU/max.i16.ll | 1 -
.../CodeGen/AMDGPU/postra-bundle-memops.mir | 58 +-
6 files changed, 553 insertions(+), 553 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index efdc55b8e68be..79be639993343 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -179,20 +179,19 @@ bool SIPostRABundler::run(MachineFunction &MF) {
Next = std::next(I);
assert(BundleEnd != I);
- if (canBundle(*BundleEnd, *I)) {
+ if (!canBundle(*BundleEnd, *I)) {
+ // Do not allow even meta instructions (e.g. SCHED_BARRIER) to be bundled
+ // between memory operations.
+ // Users add SCHED_BARRIERs for finer control over the schedule than
+ // bundling provides.
+ // Examples of meta instructions: WAVE_BARRIER, SCHED_{GROUP_}BARRIER,
+ // IGLP_OPT, amdgcn.unreachable.
+ break;
+ }
BundleEnd = I;
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
++ClauseLength;
- } else if (!I->isMetaInstruction()) {
- // Allow meta instructions in between bundle candidates, but do not
- // start or end a bundle on one.
- //
- // TODO: It may be better to move meta instructions like dbg_value
- // after the bundle. We're relying on the memory legalizer to unbundle
- // these.
- break;
- }
}
Next = std::next(BundleEnd);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538215f18..2022bd07fd281 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -4333,9 +4333,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -4437,18 +4442,45 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -4478,45 +4510,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB12_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -4694,6 +4695,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; VI-NEXT: s_cbranch_execz .LBB12_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30
@@ -5302,9 +5304,16 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -5398,14 +5407,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -5437,15 +5444,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -5493,11 +5500,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB12_2
@@ -40180,9 +40182,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -40284,18 +40291,45 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -40325,45 +40359,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB36_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -40541,6 +40544,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; VI-NEXT: s_cbranch_execz .LBB36_4
; VI-NEXT: ; %bb.3: ; %cmp.true
; VI-NEXT: v_add_f32_e32 v32, 1.0, v32
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_f32_e32 v31, 1.0, v31
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; VI-NEXT: v_add_f32_e32 v30, 1.0, v30
@@ -41149,9 +41153,16 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -41245,14 +41256,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -41284,15 +41293,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -41340,11 +41349,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB36_2
@@ -75962,9 +75966,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -76066,18 +76075,45 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr59
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr58
@@ -76107,45 +76143,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr58
; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr57
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB56_2
; VI-NEXT: ; %bb.1: ; %cmp.false
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -76352,6 +76357,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
; VI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31
; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v32, vcc
; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
@@ -76931,9 +76937,16 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr40
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -77027,14 +77040,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr56
; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; implicit-def: $vgpr38
@@ -77066,15 +77077,15 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: ; kill: killed $vgpr40
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
@@ -77122,11 +77133,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(29)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB56_2
@@ -109845,9 +109851,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
@@ -109947,18 +109958,45 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr57
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: ; kill: killed $vgpr39
; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr56
@@ -109991,39 +110029,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr55
-; VI-NEXT: ; implicit-def: $vgpr47
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr46
-; VI-NEXT: ; implicit-def: $vgpr53
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: ; implicit-def: $vgpr52
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr63
-; VI-NEXT: ; implicit-def: $vgpr62
-; VI-NEXT: ; implicit-def: $vgpr51
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr60
-; VI-NEXT: ; implicit-def: $vgpr59
-; VI-NEXT: ; implicit-def: $vgpr36
-; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; kill: killed $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr39
; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB72_2
@@ -110032,6 +110038,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -110206,6 +110213,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; VI-NEXT: s_cbranch_execz .LBB72_4
; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
@@ -110801,9 +110809,16 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
@@ -110897,14 +110912,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; kill: killed $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; kill: killed $vgpr48
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: ; implicit-def: $vgpr46
; GFX9-NEXT: ; implicit-def: $vgpr54
; GFX9-NEXT: ; implicit-def: $vgpr45
@@ -110936,15 +110949,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr48
; GFX9-NEXT: ; kill: killed $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr42
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -110996,9 +111007,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(31)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB72_2
@@ -156968,12 +156976,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr34
; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
; VI-NEXT: ; implicit-def: $vgpr38
; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr54
@@ -156998,52 +157010,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr52
; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr56
; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr56
-; VI-NEXT: ; implicit-def: $vgpr34
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; kill: killed $vgpr33
@@ -157152,6 +157123,43 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr33
; VI-NEXT: ; kill: killed $vgpr33
; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; VI-NEXT: s_cbranch_execz .LBB90_2
@@ -157286,6 +157294,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2]
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32]
; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -158514,10 +158523,60 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: v_mov_b32_e32 v46, v15
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
@@ -158527,86 +158586,42 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: v_mov_b32_e32 v47, v16
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; kill: killed $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr41
; GFX9-NEXT: ; implicit-def: $vgpr38
; GFX9-NEXT: ; implicit-def: $vgpr51
@@ -158637,15 +158652,25 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr57
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr63
-; GFX9-NEXT: ; kill: killed $vgpr59
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
@@ -158701,21 +158726,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(33)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
-; GFX9-NEXT: ; kill: killed $vgpr15
-; GFX9-NEXT: ; implicit-def: $vgpr15
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB90_2
@@ -183293,9 +183303,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16
; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15
-; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12
; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11
; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10
@@ -183305,19 +183318,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28
; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22
; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21
; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20
; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr52
; VI-NEXT: ; implicit-def: $vgpr45
; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: s_waitcnt vmcnt(13)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; VI-NEXT: ; implicit-def: $vgpr31
@@ -183415,15 +183429,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: s_waitcnt vmcnt(12)
-; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61
-; VI-NEXT: s_waitcnt vmcnt(11)
-; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr31
@@ -183435,15 +183449,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr50
-; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr49
-; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr32
@@ -184340,9 +184350,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -184402,10 +184420,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -184432,41 +184448,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -184525,6 +184514,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB94_2
@@ -205893,12 +205903,74 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16
; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
@@ -205908,20 +205980,34 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4
; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22
-; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20
; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: s_waitcnt vmcnt(14)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
@@ -205992,7 +206078,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr32
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
@@ -206002,88 +206087,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr55
; VI-NEXT: ; implicit-def: $vgpr31
; VI-NEXT: ; kill: killed $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr62
; VI-NEXT: ; implicit-def: $vgpr31
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr38
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: ; implicit-def: $vgpr61
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr58
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; kill: killed $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr35
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: ; kill: killed $vgpr48
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; kill: killed $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr43
-; VI-NEXT: ; implicit-def: $vgpr40
-; VI-NEXT: ; implicit-def: $vgpr48
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr54
@@ -206092,14 +206106,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr45
-; VI-NEXT: ; implicit-def: $vgpr44
-; VI-NEXT: ; implicit-def: $vgpr41
-; VI-NEXT: ; implicit-def: $vgpr54
-; VI-NEXT: ; implicit-def: $vgpr48
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr54
; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr48
@@ -207094,9 +207104,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
@@ -207156,10 +207174,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr50
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr44
; GFX9-NEXT: ; kill: killed $vgpr50
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: ; implicit-def: $vgpr58
; GFX9-NEXT: ; implicit-def: $vgpr36
; GFX9-NEXT: ; implicit-def: $vgpr57
@@ -207186,41 +207202,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr52
; GFX9-NEXT: ; implicit-def: $vgpr51
; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr53
; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr43
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
@@ -207279,6 +207268,27 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX9-NEXT: ; implicit-def: $vgpr33
; GFX9-NEXT: ; kill: killed $vgpr33
; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB98_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 5aac06a7f3a2b..f0f986faddcef 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -63783,8 +63783,28 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
;
; VI-LABEL: bitcast_v32i16_to_v64i8:
; VI: ; %bb.0:
-; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr19
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr19
@@ -63805,23 +63825,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr19
; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr43
; VI-NEXT: ; kill: killed $vgpr17
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr19
@@ -63847,11 +63850,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr17
; VI-NEXT: ; kill: killed $vgpr19
; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: ; implicit-def: $vgpr42
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr41
; VI-NEXT: ; implicit-def: $vgpr18
; VI-NEXT: ; implicit-def: $vgpr54
@@ -63883,6 +63881,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; VI-NEXT: ; implicit-def: $vgpr21
; VI-NEXT: ; implicit-def: $vgpr20
; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; VI-NEXT: ; implicit-def: $vgpr42
; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 2fdc1a8854863..59333f63f9330 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -15792,18 +15792,16 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s14, 10
; GFX9-NEXT: v_writelane_b32 v40, s15, 11
; GFX9-NEXT: v_writelane_b32 v40, s16, 12
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
; GFX9-NEXT: v_writelane_b32 v40, s20, 16
; GFX9-NEXT: v_writelane_b32 v40, s21, 17
-; GFX9-NEXT: v_writelane_b32 v40, s22, 18
-; GFX9-NEXT: v_writelane_b32 v40, s23, 19
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0
-; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT: v_writelane_b32 v40, s22, 18
+; GFX9-NEXT: v_writelane_b32 v40, s23, 19
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
@@ -15826,6 +15824,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT: v_mov_b32_e32 v0, s50
; GFX9-NEXT: v_writelane_b32 v40, s30, 26
+; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT: v_mov_b32_e32 v0, s51
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -15892,14 +15892,20 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s34, 28
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: ; kill: killed $sgpr34_sgpr35
+; GFX10-NEXT: ; kill: killed $sgpr34_sgpr35
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-NEXT: v_writelane_b32 v40, s10, 6
; GFX10-NEXT: v_writelane_b32 v40, s11, 7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s52
; GFX10-NEXT: v_writelane_b32 v40, s12, 8
; GFX10-NEXT: v_writelane_b32 v40, s13, 9
; GFX10-NEXT: v_writelane_b32 v40, s14, 10
@@ -15908,44 +15914,39 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0
-; GFX10-NEXT: ; meta instruction
-; GFX10-NEXT: ; meta instruction
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s52
-; GFX10-NEXT: v_mov_b32_e32 v1, s47
-; GFX10-NEXT: v_writelane_b32 v40, s23, 19
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX10-NEXT: v_mov_b32_e32 v0, s46
+; GFX10-NEXT: v_writelane_b32 v40, s23, 19
+; GFX10-NEXT: v_mov_b32_e32 v1, s47
; GFX10-NEXT: v_mov_b32_e32 v2, s48
; GFX10-NEXT: v_mov_b32_e32 v3, s49
-; GFX10-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-NEXT: s_mov_b32 s20, s36
+; GFX10-NEXT: v_writelane_b32 v40, s24, 20
; GFX10-NEXT: s_mov_b32 s21, s37
; GFX10-NEXT: s_mov_b32 s22, s38
; GFX10-NEXT: s_mov_b32 s23, s39
-; GFX10-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-NEXT: s_mov_b32 s24, s40
+; GFX10-NEXT: v_writelane_b32 v40, s25, 21
; GFX10-NEXT: s_mov_b32 s25, s41
; GFX10-NEXT: v_mov_b32_e32 v4, s50
; GFX10-NEXT: v_mov_b32_e32 v5, s51
-; GFX10-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-NEXT: s_mov_b32 s26, s42
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
+; GFX10-NEXT: v_writelane_b32 v40, s26, 22
+; GFX10-NEXT: s_mov_b32 s26, s42
; GFX10-NEXT: v_writelane_b32 v40, s27, 23
; GFX10-NEXT: s_mov_b32 s27, s43
; GFX10-NEXT: v_writelane_b32 v40, s28, 24
@@ -16111,15 +16112,22 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX10-SCRATCH-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2
+; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s12, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10
@@ -16128,11 +16136,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_clause 0x2
-; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: ; meta instruction
-; GFX10-SCRATCH-NEXT: ; meta instruction
+; GFX10-SCRATCH-NEXT: s_clause 0x1
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40
; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
@@ -16141,8 +16145,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index a5b64f6f80d9b..956ee8a3a5384 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -149,7 +149,6 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: global_load_dword v4, v0, s[2:3]
-; GFX9-NEXT: ; kill: killed $sgpr2_sgpr3
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 1e6c00a54a012..b8972cce73bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -114,7 +114,7 @@ body: |
$vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
...
-# Middle dbg_value should be bundled
+# Loads are not bundled due to middle dbg_value
---
name: bundle_dbg_value_0
tracksRegLiveness: true
@@ -124,18 +124,16 @@ body: |
; GCN-LABEL: name: bundle_dbg_value_0
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
- ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
DBG_VALUE $vgpr0, 0, 0
$vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
...
-# Middle dbg_value should be bundled
+# Loads are not bundled due to middle dbg_value
---
name: bundle_dbg_value_1
tracksRegLiveness: true
@@ -145,12 +143,10 @@ body: |
; GCN-LABEL: name: bundle_dbg_value_1
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
- ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
- ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
DBG_VALUE $vgpr0, 0, 0
@@ -159,7 +155,7 @@ body: |
DBG_VALUE $vgpr2, 0, 0
...
-# Starting and ending dbg_values should not be in the bundle
+# Loads are not bundled due to middle dbg_value
---
name: bundle_dbg_value_2
tracksRegLiveness: true
@@ -170,11 +166,9 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
- ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $vgpr2, 0, 0
DBG_VALUE $vgpr1, 0, 0
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
@@ -183,6 +177,7 @@ body: |
DBG_VALUE $vgpr2, 0, 0
...
+# Loads are not bundled due to KILL
---
name: bundle_kill
tracksRegLiveness: true
@@ -193,11 +188,9 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: KILL $vgpr1
- ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: KILL $vgpr1
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
KILL $vgpr1
@@ -215,11 +208,9 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: KILL internal $vgpr0
- ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: KILL $vgpr0
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
KILL $vgpr0
@@ -341,11 +332,10 @@ body: |
; GCN-LABEL: name: post_bundle_kill_and_null_reg_dbginfo
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: DBG_VALUE $noreg, $noreg
- ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
- ; GCN-NEXT: }
+ ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
+ ; GCN-NEXT: DBG_VALUE $noreg, $noreg
+ ; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
+ ; GCN-NEXT: KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
$vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
DBG_VALUE $noreg, $noreg
$vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
>From 4f7612662c434a3eaf73725f0b7395e20ae88dac Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:48:10 -0500
Subject: [PATCH 2/5] formatting
---
llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 79be639993343..f009bfd2ca372 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -188,10 +188,11 @@ bool SIPostRABundler::run(MachineFunction &MF) {
// IGLP_OPT, amdgcn.unreachable.
break;
}
- BundleEnd = I;
- if (I->getNumExplicitDefs() != 0)
- Defs.insert(I->defs().begin()->getReg());
- ++ClauseLength;
+
+ BundleEnd = I;
+ if (I->getNumExplicitDefs() != 0)
+ Defs.insert(I->defs().begin()->getReg());
+ ++ClauseLength;
}
Next = std::next(BundleEnd);
>From ecdc6121afeadaa8ae46f0611fcd14a0bf73e077 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 20:48:28 -0500
Subject: [PATCH 3/5] formatting
---
llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index f009bfd2ca372..e12d9e6c69509 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -188,7 +188,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
// IGLP_OPT, amdgcn.unreachable.
break;
}
-
+
BundleEnd = I;
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
>From 7a1de8d8ad382cca6e3d3c00bf319727f06fac8d Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Wed, 13 Aug 2025 23:32:38 -0500
Subject: [PATCH 4/5] Updating a lit-test
---
.../CodeGen/AMDGPU/postra-bundle-memops.mir | 67 +++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index b8972cce73bb5..10ee822405caa 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -341,3 +341,70 @@ body: |
$vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
...
+
+# The 3rd and 4th loads are not bundled due to an in-between SCHED_BARRIER meta instruction.
+# This helps ensure that SCHED_BARRIERs are honored as fully as possible.
+# MASK 1924 = 0b 0111 1000 0100: VALU and all VMEM (VMEM read/write) cannot be
+# scheduled across the SCHED_BARRIER.
+
+---
+name: extra_sched_barrier_in_bundle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: extra_sched_barrier_in_bundle
+ ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; GCN-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; GCN-NEXT: SCHED_BARRIER 1924
+ ; GCN-NEXT: renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ ; GCN-NEXT: SCHED_BARRIER 1924
+ ; GCN-NEXT: renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ ; GCN-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+ ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+ ; GCN-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: S_ENDPGM 0
+ renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec
+ renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec
+ }
+ renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+ renamable $vgpr10 = IMPLICIT_DEF
+ renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ SCHED_BARRIER 1924
+ renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ SCHED_BARRIER 1924
+ renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+ renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+ BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+ }
+ BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+ }
+ S_ENDPGM 0
+...
>From 59be7551e51d8e1f453e4cb8beeb57dc04f961b4 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Thu, 14 Aug 2025 11:25:36 -0500
Subject: [PATCH 5/5] Resolving a merge conflict
---
.../test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
index 5c7dc8a079345..fe9dd0294cc4d 100644
--- a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
@@ -23,12 +23,10 @@ body: |
; GCN-BUNDLE-LABEL: name: clause_implicit_def
; GCN-BUNDLE: liveins: $vgpr0, $sgpr32
; GCN-BUNDLE-NEXT: {{ $}}
- ; GCN-BUNDLE-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $sgpr32, implicit $exec, implicit $flat_scr {
- ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
- ; GCN-BUNDLE-NEXT: $vgpr2 = IMPLICIT_DEF
- ; GCN-BUNDLE-NEXT: $vgpr3 = IMPLICIT_DEF
- ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORDX2_SADDR internal $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
- ; GCN-BUNDLE-NEXT: }
+ ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+ ; GCN-BUNDLE-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; GCN-BUNDLE-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORDX2_SADDR $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
$vgpr2 = IMPLICIT_DEF
$vgpr3 = IMPLICIT_DEF
More information about the llvm-commits
mailing list